azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
|
@@ -8,27 +8,28 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
8
8
|
from azure.ai.evaluation._common.constants import EvaluationMetrics
|
|
9
9
|
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
|
|
10
10
|
|
|
11
|
+
|
|
11
12
|
@experimental
|
|
12
13
|
class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
13
14
|
"""
|
|
14
|
-
Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
|
|
15
|
-
where query represents the user query and response represents the AI system response given the provided context.
|
|
16
|
-
|
|
17
|
-
Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
|
|
15
|
+
Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
|
|
16
|
+
where query represents the user query and response represents the AI system response given the provided context.
|
|
17
|
+
|
|
18
|
+
Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
|
|
18
19
|
emotional state of a person.
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
It identifies the following attributes:
|
|
22
|
-
|
|
23
|
+
|
|
23
24
|
- emotional_state
|
|
24
25
|
- protected_class
|
|
25
26
|
- groundedness
|
|
26
27
|
|
|
27
28
|
:param credential: The credential for connecting to Azure AI project. Required
|
|
28
29
|
:type credential: ~azure.core.credentials.TokenCredential
|
|
29
|
-
:param azure_ai_project: The
|
|
30
|
-
It contains subscription id, resource group, and project name.
|
|
31
|
-
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
|
|
30
|
+
:param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
|
|
31
|
+
or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
|
|
32
|
+
:type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
|
|
32
33
|
:param kwargs: Additional arguments to pass to the evaluator.
|
|
33
34
|
:type kwargs: Any
|
|
34
35
|
|
|
@@ -42,13 +43,13 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
|
42
43
|
:caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.
|
|
43
44
|
|
|
44
45
|
.. admonition:: Example using Azure AI Project URL:
|
|
45
|
-
|
|
46
|
+
|
|
46
47
|
.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
|
|
47
48
|
:start-after: [START ungrounded_attributes_evaluator]
|
|
48
49
|
:end-before: [END ungrounded_attributes_evaluator]
|
|
49
50
|
:language: python
|
|
50
51
|
:dedent: 8
|
|
51
|
-
:caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
|
|
52
|
+
:caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
|
|
52
53
|
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
|
|
53
54
|
|
|
54
55
|
.. note::
|
|
@@ -57,19 +58,26 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
|
57
58
|
for the ungrounded attributes will be "ungrounded_attributes_label".
|
|
58
59
|
"""
|
|
59
60
|
|
|
60
|
-
id = "ungrounded_attributes"
|
|
61
|
+
id = "azureai://built-in/evaluators/ungrounded_attributes"
|
|
61
62
|
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
|
|
63
|
+
_OPTIONAL_PARAMS = ["query"]
|
|
62
64
|
|
|
63
65
|
@override
|
|
64
66
|
def __init__(
|
|
65
67
|
self,
|
|
66
68
|
credential,
|
|
67
69
|
azure_ai_project,
|
|
70
|
+
**kwargs,
|
|
68
71
|
):
|
|
72
|
+
# Set default for evaluate_query if not provided
|
|
73
|
+
if "evaluate_query" not in kwargs:
|
|
74
|
+
kwargs["evaluate_query"] = True
|
|
75
|
+
|
|
69
76
|
super().__init__(
|
|
70
77
|
eval_metric=EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
|
|
71
78
|
azure_ai_project=azure_ai_project,
|
|
72
79
|
credential=credential,
|
|
80
|
+
**kwargs,
|
|
73
81
|
)
|
|
74
82
|
|
|
75
83
|
@overload
|
|
@@ -109,5 +117,5 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
|
109
117
|
:return: The ungrounded attributes label.
|
|
110
118
|
:rtype: Dict[str, Union[str, bool]]
|
|
111
119
|
"""
|
|
112
|
-
|
|
120
|
+
|
|
113
121
|
return super().__call__(*args, **kwargs)
|
|
@@ -40,9 +40,9 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
|
40
40
|
|
|
41
41
|
:param credential: The credential for connecting to Azure AI project. Required
|
|
42
42
|
:type credential: ~azure.core.credentials.TokenCredential
|
|
43
|
-
:param azure_ai_project: The
|
|
44
|
-
name.
|
|
45
|
-
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
|
|
43
|
+
:param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
|
|
44
|
+
or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
|
|
45
|
+
:type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
|
|
46
46
|
:param threshold: The threshold for the IndirectAttack evaluator. Default is 0.
|
|
47
47
|
:type threshold: int
|
|
48
48
|
|
|
@@ -54,32 +54,35 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
|
|
|
54
54
|
:language: python
|
|
55
55
|
:dedent: 8
|
|
56
56
|
:caption: Initialize and call an IndirectAttackEvaluator.
|
|
57
|
-
|
|
57
|
+
|
|
58
58
|
.. admonition:: Example using Azure AI Project URL:
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
|
|
61
61
|
:start-after: [START indirect_attack_evaluator]
|
|
62
62
|
:end-before: [END indirect_attack_evaluator]
|
|
63
63
|
:language: python
|
|
64
64
|
:dedent: 8
|
|
65
|
-
:caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
|
|
65
|
+
:caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
|
|
66
66
|
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
|
|
67
67
|
|
|
68
68
|
"""
|
|
69
69
|
|
|
70
|
-
id = "
|
|
70
|
+
id = "azureai://built-in/evaluators/indirect_attack"
|
|
71
71
|
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
|
|
72
|
+
_OPTIONAL_PARAMS = ["query"]
|
|
72
73
|
|
|
73
74
|
@override
|
|
74
75
|
def __init__(
|
|
75
76
|
self,
|
|
76
77
|
credential,
|
|
77
78
|
azure_ai_project,
|
|
79
|
+
**kwargs,
|
|
78
80
|
):
|
|
79
81
|
super().__init__(
|
|
80
82
|
eval_metric=EvaluationMetrics.XPIA,
|
|
81
83
|
azure_ai_project=azure_ai_project,
|
|
82
84
|
credential=credential,
|
|
85
|
+
**kwargs,
|
|
83
86
|
)
|
|
84
87
|
|
|
85
88
|
@overload
|
|
@@ -9,6 +9,15 @@ from typing import Optional
|
|
|
9
9
|
from azure.core.exceptions import AzureError
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
class ErrorMessage(Enum):
|
|
13
|
+
"""Error messages to be used when raising EvaluationException.
|
|
14
|
+
|
|
15
|
+
These messages are used to provide a consistent error message format across the SDK.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
MALFORMED_CONVERSATION_HISTORY = "Malformed Conversation History: Query parameter representing conversation history should have exactly one more user query than agent responses"
|
|
19
|
+
|
|
20
|
+
|
|
12
21
|
class ErrorCategory(Enum):
|
|
13
22
|
"""Error category to be specified when using EvaluationException class.
|
|
14
23
|
|
|
@@ -87,6 +96,7 @@ class ErrorTarget(Enum):
|
|
|
87
96
|
TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
|
|
88
97
|
RED_TEAM = "RedTeam"
|
|
89
98
|
AOAI_GRADER = "AoaiGrader"
|
|
99
|
+
CONVERSATION_HISTORY_PARSING = "_get_conversation_history"
|
|
90
100
|
|
|
91
101
|
|
|
92
102
|
class EvaluationException(AzureError):
|
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, MutableMapping, Optional, TypedDict, cast
|
|
|
7
7
|
|
|
8
8
|
from typing_extensions import Self, Unpack
|
|
9
9
|
|
|
10
|
-
from azure.ai.evaluation._user_agent import
|
|
10
|
+
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
11
11
|
from azure.core.configuration import Configuration
|
|
12
12
|
from azure.core.pipeline import AsyncPipeline, Pipeline
|
|
13
13
|
from azure.core.pipeline.policies import (
|
|
@@ -454,7 +454,7 @@ def get_http_client(**kwargs: Any) -> HttpPipeline:
|
|
|
454
454
|
:returns: An HttpPipeline with a set of applied policies:
|
|
455
455
|
:rtype: HttpPipeline
|
|
456
456
|
"""
|
|
457
|
-
kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=
|
|
457
|
+
kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
|
|
458
458
|
return HttpPipeline(**kwargs)
|
|
459
459
|
|
|
460
460
|
|
|
@@ -464,5 +464,5 @@ def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
|
|
|
464
464
|
:returns: An AsyncHttpPipeline with a set of applied policies:
|
|
465
465
|
:rtype: AsyncHttpPipeline
|
|
466
466
|
"""
|
|
467
|
-
kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=
|
|
467
|
+
kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
|
|
468
468
|
return AsyncHttpPipeline(**kwargs)
|
|
@@ -19,7 +19,7 @@ class BatchEngineConfig:
|
|
|
19
19
|
batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
|
|
20
20
|
"""The maximum amount of time to wait for all evaluations in the batch to complete."""
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
line_timeout_seconds: int = 600
|
|
23
23
|
"""The maximum amount of time to wait for an evaluation to run against a single entry
|
|
24
24
|
in the data input to complete."""
|
|
25
25
|
|
|
@@ -32,13 +32,16 @@ class BatchEngineConfig:
|
|
|
32
32
|
default_num_results: int = 100
|
|
33
33
|
"""The default number of results to return if you don't ask for all results."""
|
|
34
34
|
|
|
35
|
+
raise_on_error: bool = True
|
|
36
|
+
"""Whether to raise an error if an evaluation fails."""
|
|
37
|
+
|
|
35
38
|
def __post_init__(self):
|
|
36
39
|
if self.logger is None:
|
|
37
40
|
raise ValueError("logger cannot be None")
|
|
38
41
|
if self.batch_timeout_seconds <= 0:
|
|
39
42
|
raise ValueError("batch_timeout_seconds must be greater than 0")
|
|
40
|
-
if self.
|
|
41
|
-
raise ValueError("
|
|
43
|
+
if self.line_timeout_seconds <= 0:
|
|
44
|
+
raise ValueError("line_timeout_seconds must be greater than 0")
|
|
42
45
|
if self.max_concurrency <= 0:
|
|
43
46
|
raise ValueError("max_concurrency must be greater than 0")
|
|
44
47
|
if self.default_num_results <= 0:
|
|
@@ -20,15 +20,31 @@ from concurrent.futures import Executor
|
|
|
20
20
|
from functools import partial
|
|
21
21
|
from contextlib import contextmanager
|
|
22
22
|
from datetime import datetime, timezone
|
|
23
|
-
from typing import
|
|
23
|
+
from typing import (
|
|
24
|
+
Any,
|
|
25
|
+
Callable,
|
|
26
|
+
Dict,
|
|
27
|
+
Final,
|
|
28
|
+
Generator,
|
|
29
|
+
List,
|
|
30
|
+
Mapping,
|
|
31
|
+
MutableMapping,
|
|
32
|
+
Optional,
|
|
33
|
+
Sequence,
|
|
34
|
+
Set,
|
|
35
|
+
Tuple,
|
|
36
|
+
cast,
|
|
37
|
+
Literal,
|
|
38
|
+
)
|
|
24
39
|
from uuid import uuid4
|
|
25
40
|
|
|
41
|
+
from ._config import BatchEngineConfig
|
|
26
42
|
from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
|
|
27
43
|
from ._status import BatchStatus
|
|
28
44
|
from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
|
|
29
45
|
from ._run_storage import AbstractRunStorage, NoOpRunStorage
|
|
30
|
-
from .._common._logging import log_progress, NodeLogManager
|
|
31
|
-
from ..._exceptions import ErrorBlame
|
|
46
|
+
from .._common._logging import log_progress, logger, NodeLogManager
|
|
47
|
+
from ..._exceptions import ErrorBlame, EvaluationException
|
|
32
48
|
from ._exceptions import (
|
|
33
49
|
BatchEngineCanceledError,
|
|
34
50
|
BatchEngineError,
|
|
@@ -54,30 +70,25 @@ class BatchEngine:
|
|
|
54
70
|
self,
|
|
55
71
|
func: Callable,
|
|
56
72
|
*,
|
|
73
|
+
config: BatchEngineConfig,
|
|
57
74
|
storage: Optional[AbstractRunStorage] = None,
|
|
58
|
-
batch_timeout_sec: Optional[int] = None,
|
|
59
|
-
line_timeout_sec: Optional[int] = None,
|
|
60
|
-
max_worker_count: Optional[int] = None,
|
|
61
75
|
executor: Optional[Executor] = None,
|
|
62
76
|
):
|
|
63
77
|
"""Create a new batch engine instance
|
|
64
78
|
|
|
65
79
|
:param Callable func: The function to run the flow
|
|
80
|
+
:param BatchEngineConfig config: The configuration for the batch engine
|
|
66
81
|
:param Optional[AbstractRunStorage] storage: The storage to store execution results
|
|
67
|
-
:param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
|
|
68
|
-
:param Optional[int] line_timeout_sec: The timeout of each line in seconds
|
|
69
|
-
:param Optional[int] max_worker_count: The concurrency limit of batch run
|
|
70
82
|
:param Optional[Executor] executor: The executor to run the flow (if needed)
|
|
71
83
|
"""
|
|
72
84
|
|
|
73
85
|
self._func: Callable = func
|
|
86
|
+
self._config: BatchEngineConfig = config
|
|
74
87
|
self._storage: AbstractRunStorage = storage or NoOpRunStorage()
|
|
75
88
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
self.
|
|
79
|
-
self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
|
|
80
|
-
self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
|
|
89
|
+
self._batch_timeout_sec = self._config.batch_timeout_seconds
|
|
90
|
+
self._line_timeout_sec = self._config.line_timeout_seconds
|
|
91
|
+
self._max_worker_count = self._config.max_concurrency
|
|
81
92
|
|
|
82
93
|
self._executor: Optional[Executor] = executor
|
|
83
94
|
self._is_canceled: bool = False
|
|
@@ -85,15 +96,13 @@ class BatchEngine:
|
|
|
85
96
|
async def run(
|
|
86
97
|
self,
|
|
87
98
|
data: Sequence[Mapping[str, Any]],
|
|
88
|
-
column_mapping: Mapping[str, str],
|
|
99
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
89
100
|
*,
|
|
90
101
|
id: Optional[str] = None,
|
|
91
102
|
max_lines: Optional[int] = None,
|
|
92
103
|
) -> BatchResult:
|
|
93
104
|
if not data:
|
|
94
105
|
raise BatchEngineValidationError("Please provide a non-empty data mapping.")
|
|
95
|
-
if not column_mapping:
|
|
96
|
-
raise BatchEngineValidationError("The column mapping is required.")
|
|
97
106
|
|
|
98
107
|
start_time = datetime.now(timezone.utc)
|
|
99
108
|
|
|
@@ -105,6 +114,8 @@ class BatchEngine:
|
|
|
105
114
|
id = id or str(uuid4())
|
|
106
115
|
result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
|
|
107
116
|
return result
|
|
117
|
+
except EvaluationException:
|
|
118
|
+
raise
|
|
108
119
|
except Exception as ex:
|
|
109
120
|
raise BatchEngineError(
|
|
110
121
|
"Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR
|
|
@@ -114,20 +125,58 @@ class BatchEngine:
|
|
|
114
125
|
# TODO ralphe: Make sure this works
|
|
115
126
|
self._is_canceled = True
|
|
116
127
|
|
|
117
|
-
@staticmethod
|
|
118
128
|
def _apply_column_mapping(
|
|
129
|
+
self,
|
|
119
130
|
data: Sequence[Mapping[str, Any]],
|
|
120
|
-
column_mapping: Mapping[str, str],
|
|
131
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
121
132
|
max_lines: Optional[int],
|
|
122
133
|
) -> Sequence[Mapping[str, str]]:
|
|
134
|
+
|
|
135
|
+
resolved_column_mapping: Mapping[str, str] = self._resolve_column_mapping(column_mapping)
|
|
136
|
+
resolved_column_mapping.update(self._generate_defaults_for_column_mapping())
|
|
137
|
+
return self._apply_column_mapping_to_lines(data, resolved_column_mapping, max_lines)
|
|
138
|
+
|
|
139
|
+
def _resolve_column_mapping(
|
|
140
|
+
self,
|
|
141
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
142
|
+
) -> Mapping[str, str]:
|
|
143
|
+
parameters = inspect.signature(self._func).parameters
|
|
144
|
+
default_column_mapping: Dict[str, str] = {
|
|
145
|
+
name: f"${{data.{name}}}"
|
|
146
|
+
for name, value in parameters.items()
|
|
147
|
+
if name not in ["self", "cls", "args", "kwargs"]
|
|
148
|
+
}
|
|
149
|
+
resolved_mapping: Dict[str, str] = default_column_mapping.copy()
|
|
150
|
+
|
|
151
|
+
for name, value in parameters.items():
|
|
152
|
+
if value and value.default is not inspect.Parameter.empty:
|
|
153
|
+
resolved_mapping.pop(name)
|
|
154
|
+
|
|
155
|
+
resolved_mapping.update(column_mapping or {})
|
|
156
|
+
return resolved_mapping
|
|
157
|
+
|
|
158
|
+
def _generate_defaults_for_column_mapping(self) -> Mapping[Literal["$defaults$"], Any]:
|
|
159
|
+
|
|
160
|
+
return {
|
|
161
|
+
DEFAULTS_KEY: {
|
|
162
|
+
name: value.default
|
|
163
|
+
for name, value in inspect.signature(self._func).parameters.items()
|
|
164
|
+
if value.default is not inspect.Parameter.empty
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
@staticmethod
|
|
169
|
+
def _apply_column_mapping_to_lines(
|
|
170
|
+
data: Sequence[Mapping[str, Any]],
|
|
171
|
+
column_mapping: Mapping[str, str],
|
|
172
|
+
max_lines: Optional[int],
|
|
173
|
+
) -> Sequence[Mapping[str, Any]]:
|
|
123
174
|
data = data[:max_lines] if max_lines else data
|
|
124
175
|
|
|
125
176
|
inputs: Sequence[Mapping[str, Any]] = []
|
|
126
|
-
line: int = 0
|
|
127
177
|
defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))
|
|
128
178
|
|
|
129
|
-
for input in data:
|
|
130
|
-
line += 1
|
|
179
|
+
for line_number, input in enumerate(data, start=1):
|
|
131
180
|
mapped: Dict[str, Any] = {}
|
|
132
181
|
missing_inputs: Set[str] = set()
|
|
133
182
|
|
|
@@ -148,18 +197,18 @@ class BatchEngine:
|
|
|
148
197
|
continue
|
|
149
198
|
|
|
150
199
|
dict_path = match.group(1)
|
|
151
|
-
found,
|
|
200
|
+
found, mapped_value = get_value_from_path(dict_path, input)
|
|
152
201
|
if not found: # try default value
|
|
153
|
-
found,
|
|
202
|
+
found, mapped_value = get_value_from_path(dict_path, defaults)
|
|
154
203
|
|
|
155
204
|
if found:
|
|
156
|
-
mapped[key] =
|
|
205
|
+
mapped[key] = mapped_value
|
|
157
206
|
else:
|
|
158
207
|
missing_inputs.add(dict_path)
|
|
159
208
|
|
|
160
209
|
if missing_inputs:
|
|
161
210
|
missing = ", ".join(missing_inputs)
|
|
162
|
-
raise BatchEngineValidationError(f"Missing inputs for line {
|
|
211
|
+
raise BatchEngineValidationError(f"Missing inputs for line {line_number}: '{missing}'")
|
|
163
212
|
|
|
164
213
|
inputs.append(mapped)
|
|
165
214
|
|
|
@@ -212,10 +261,12 @@ class BatchEngine:
|
|
|
212
261
|
end_time=None,
|
|
213
262
|
tokens=TokenMetrics(0, 0, 0),
|
|
214
263
|
error=BatchRunError("The line run is not completed.", None),
|
|
264
|
+
index=i,
|
|
215
265
|
)
|
|
216
266
|
)
|
|
217
267
|
for i in range(len(batch_inputs))
|
|
218
268
|
]
|
|
269
|
+
self.handle_line_failures(result_details)
|
|
219
270
|
|
|
220
271
|
for line_result in result_details:
|
|
221
272
|
# Indicate the worst status of the batch run. This works because
|
|
@@ -229,9 +280,15 @@ class BatchEngine:
|
|
|
229
280
|
metrics.total_tokens += line_result.tokens.total_tokens
|
|
230
281
|
|
|
231
282
|
if failed_lines and not error:
|
|
232
|
-
|
|
233
|
-
|
|
283
|
+
error_message = f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
|
|
284
|
+
first_exception: Optional[Exception] = next(
|
|
285
|
+
(result.error.exception for result in result_details if result.error and result.error.exception),
|
|
286
|
+
None,
|
|
234
287
|
)
|
|
288
|
+
if first_exception is not None:
|
|
289
|
+
error_message += f" {first_exception}"
|
|
290
|
+
|
|
291
|
+
error = BatchEngineRunFailedError(error_message)
|
|
235
292
|
|
|
236
293
|
return BatchResult(
|
|
237
294
|
status=status,
|
|
@@ -283,6 +340,13 @@ class BatchEngine:
|
|
|
283
340
|
# TODO ralphe: set logger to use here
|
|
284
341
|
)
|
|
285
342
|
|
|
343
|
+
def __preprocess_inputs(self, inputs: Mapping[str, Any]) -> Mapping[str, Any]:
|
|
344
|
+
|
|
345
|
+
func_params = inspect.signature(self._func).parameters
|
|
346
|
+
|
|
347
|
+
filtered_params = {key: value for key, value in inputs.items() if key in func_params}
|
|
348
|
+
return filtered_params
|
|
349
|
+
|
|
286
350
|
async def _exec_line_async(
|
|
287
351
|
self,
|
|
288
352
|
run_id: str,
|
|
@@ -298,6 +362,7 @@ class BatchEngine:
|
|
|
298
362
|
end_time=None,
|
|
299
363
|
tokens=TokenMetrics(0, 0, 0),
|
|
300
364
|
error=None,
|
|
365
|
+
index=index,
|
|
301
366
|
)
|
|
302
367
|
|
|
303
368
|
try:
|
|
@@ -313,15 +378,17 @@ class BatchEngine:
|
|
|
313
378
|
# For now we will just run the function in the current process, but in the future we may
|
|
314
379
|
# want to consider running the function in a separate process for isolation reasons.
|
|
315
380
|
output: Any
|
|
381
|
+
|
|
382
|
+
processed_inputs = self.__preprocess_inputs(inputs)
|
|
316
383
|
if is_async_callable(self._func):
|
|
317
|
-
output = await self._func(**
|
|
384
|
+
output = await self._func(**processed_inputs)
|
|
318
385
|
else:
|
|
319
386
|
# to maximize the parallelism, we run the synchronous function in a separate thread
|
|
320
387
|
# and await its result
|
|
321
388
|
output = await asyncio.get_event_loop().run_in_executor(
|
|
322
|
-
self._executor,
|
|
323
|
-
|
|
324
|
-
|
|
389
|
+
self._executor, partial(self._func, **processed_inputs)
|
|
390
|
+
)
|
|
391
|
+
|
|
325
392
|
# This should in theory never happen but as an extra precaution, let's check if the output
|
|
326
393
|
# is awaitable and await it if it is.
|
|
327
394
|
if inspect.isawaitable(output):
|
|
@@ -340,6 +407,24 @@ class BatchEngine:
|
|
|
340
407
|
|
|
341
408
|
return index, details
|
|
342
409
|
|
|
410
|
+
@staticmethod
|
|
411
|
+
def handle_line_failures(run_infos: List[BatchRunDetails], raise_on_line_failure: bool = False):
|
|
412
|
+
"""Handle line failures in batch run"""
|
|
413
|
+
failed_run_infos: List[BatchRunDetails] = [r for r in run_infos if r.status == BatchStatus.Failed]
|
|
414
|
+
failed_msg: Optional[str] = None
|
|
415
|
+
if len(failed_run_infos) > 0:
|
|
416
|
+
failed_indexes = ",".join([str(r.index) for r in failed_run_infos])
|
|
417
|
+
first_fail_exception: str = failed_run_infos[0].error.details
|
|
418
|
+
if raise_on_line_failure:
|
|
419
|
+
failed_msg = "Flow run failed due to the error: " + first_fail_exception
|
|
420
|
+
raise Exception(failed_msg)
|
|
421
|
+
|
|
422
|
+
failed_msg = (
|
|
423
|
+
f"{len(failed_run_infos)}/{len(run_infos)} flow run failed, indexes: [{failed_indexes}],"
|
|
424
|
+
f" exception of index {failed_run_infos[0].index}: {first_fail_exception}"
|
|
425
|
+
)
|
|
426
|
+
logger.error(failed_msg)
|
|
427
|
+
|
|
343
428
|
def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
|
|
344
429
|
# TODO ralphe: implement?
|
|
345
430
|
pass
|
|
@@ -90,7 +90,9 @@ def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
|
|
|
90
90
|
except ImportError:
|
|
91
91
|
raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
|
|
92
92
|
except AttributeError:
|
|
93
|
-
logging.warning(
|
|
93
|
+
logging.warning(
|
|
94
|
+
"The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name
|
|
95
|
+
)
|
|
94
96
|
|
|
95
97
|
|
|
96
98
|
def inject_openai_api():
|
|
@@ -117,6 +119,7 @@ def recover_openai_api():
|
|
|
117
119
|
|
|
118
120
|
class CaptureOpenAITokenUsage:
|
|
119
121
|
"""Context manager to capture OpenAI token usage."""
|
|
122
|
+
|
|
120
123
|
def __init__(self):
|
|
121
124
|
self._tokens = TokenMetrics(0, 0, 0)
|
|
122
125
|
|
|
@@ -126,4 +129,4 @@ class CaptureOpenAITokenUsage:
|
|
|
126
129
|
|
|
127
130
|
def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
|
|
128
131
|
captured_metrics = _token_metrics.get()
|
|
129
|
-
self._tokens.update(captured_metrics)
|
|
132
|
+
self._tokens.update(captured_metrics)
|
|
@@ -55,6 +55,8 @@ class BatchRunDetails:
|
|
|
55
55
|
"""The token metrics of the line run."""
|
|
56
56
|
error: Optional[BatchRunError]
|
|
57
57
|
"""The error of the line run. This will only be set if the status is Failed."""
|
|
58
|
+
index: int
|
|
59
|
+
"""The line run index."""
|
|
58
60
|
|
|
59
61
|
@property
|
|
60
62
|
def duration(self) -> timedelta:
|
|
@@ -58,7 +58,7 @@ class Run:
|
|
|
58
58
|
dynamic_callable: Callable,
|
|
59
59
|
name_prefix: Optional[str],
|
|
60
60
|
inputs: Sequence[Mapping[str, Any]],
|
|
61
|
-
column_mapping: Mapping[str, str],
|
|
61
|
+
column_mapping: Optional[Mapping[str, str]] = None,
|
|
62
62
|
created_on: Optional[datetime] = None,
|
|
63
63
|
run: Optional["Run"] = None,
|
|
64
64
|
):
|
|
@@ -70,7 +70,7 @@ class Run:
|
|
|
70
70
|
self.dynamic_callable = dynamic_callable
|
|
71
71
|
self.name = self._generate_run_name(name_prefix, self._created_on)
|
|
72
72
|
self.inputs = inputs
|
|
73
|
-
self.column_mapping = column_mapping
|
|
73
|
+
self.column_mapping: Optional[Mapping[str, str]] = column_mapping
|
|
74
74
|
self.result: Optional[BatchResult] = None
|
|
75
75
|
self.metrics: Mapping[str, Any] = {}
|
|
76
76
|
self._run = run
|