azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1622 -765
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +6 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 from enum import Enum

-from typing import Dict
+from typing import Dict, Union
 from typing_extensions import overload, override

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
@@ -12,7 +12,7 @@ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 import math


-class RougeType(Enum):
+class RougeType(str, Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
@@ -71,13 +71,13 @@ class RougeScoreEvaluator(EvaluatorBase):
             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START rouge_score_evaluator]
             :end-before: [END rouge_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. admonition:: Example with threshold:
@@ -95,17 +95,17 @@ class RougeScoreEvaluator(EvaluatorBase):

     @override
     def __init__(
-        self,
+        self,
         rouge_type: RougeType,
         *,
         precision_threshold: float = 0.5,
         recall_threshold: float = 0.5,
-        f1_score_threshold: float = 0.5
+        f1_score_threshold: float = 0.5,
     ):
         self._rouge_type = rouge_type
         self._higher_is_better = True
         super().__init__()
-
+
         # Type checking for threshold parameters
         for name, value in [
             ("precision_threshold", precision_threshold),
@@ -114,7 +114,7 @@ class RougeScoreEvaluator(EvaluatorBase):
         ]:
             if not isinstance(value, float):
                 raise TypeError(f"{name} must be a float, got {type(value)}")
-
+
         self._threshold = {
             "precision": precision_threshold,
             "recall": recall_threshold,
@@ -122,10 +122,10 @@ class RougeScoreEvaluator(EvaluatorBase):
         }

     def _get_binary_result(
-
-
-
-
+        self,
+        rouge_precision: float,
+        rouge_recall: float,
+        rouge_f1_score: float,
     ) -> Dict[str, bool]:
         """
         Get binary result based on the threshold.
@@ -150,22 +150,22 @@ class RougeScoreEvaluator(EvaluatorBase):
         precision_valid = not math.isnan(rouge_precision)
         recall_valid = not math.isnan(rouge_recall)
         f1_valid = not math.isnan(rouge_f1_score)
-
+
         if self._higher_is_better:
             if precision_valid:
-                results["rouge_precision_result"] =
+                results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
             if recall_valid:
-                results["rouge_recall_result"] =
+                results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
             if f1_valid:
-                results["rouge_f1_score_result"] =
+                results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
         else:
             if precision_valid:
-                results["rouge_precision_result"] =
+                results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
             if recall_valid:
-                results["rouge_recall_result"] =
+                results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
             if f1_valid:
-                results["rouge_f1_score_result"] =
-
+                results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
         return results

     @override
@@ -179,17 +179,17 @@ class RougeScoreEvaluator(EvaluatorBase):
         """
         ground_truth = eval_input["ground_truth"]
         response = eval_input["response"]
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type
-        metrics = scorer.score(ground_truth, response)[self._rouge_type
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type]
         binary_results = {
             "rouge_precision_result": False,
             "rouge_recall_result": False,
             "rouge_f1_score_result": False,
         }
         # Convert metrics to floats, using nan for None or non-convertible values
-        rouge_precision = float(metrics.precision) if metrics.precision is not None else float(
-        rouge_recall = float(metrics.recall) if metrics.recall is not None else float(
-        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float(
+        rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+        rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
         binary_results = self._get_binary_result(
             rouge_precision=rouge_precision,
             rouge_recall=rouge_recall,
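The hunks above give RougeScoreEvaluator per-metric pass/fail thresholds and make RougeType a str-backed enum, so members can be passed wherever a ROUGE type string is expected. The following is a minimal usage sketch based only on the parameters and result keys visible in this diff; the sample strings are illustrative, and the `RougeType.ROUGE_4` member name is inferred from the "four-gram rouge type" caption rather than shown here.

```python
# Minimal sketch, not taken from the package samples. Sample strings are
# illustrative; RougeType.ROUGE_4 is assumed from the "four-gram" caption above.
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

evaluator = RougeScoreEvaluator(
    rouge_type=RougeType.ROUGE_4,   # str-backed enum as of this version
    precision_threshold=0.6,        # per-metric thresholds, type-checked as floats in __init__
    recall_threshold=0.5,
    f1_score_threshold=0.55,
)

result = evaluator(
    ground_truth="The capital of France is Paris.",
    response="Paris is the capital of France.",
)

# Alongside rouge_precision / rouge_recall / rouge_f1_score, the result now carries
# pass/fail keys such as "rouge_precision_result" computed against the thresholds.
print(result)
```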
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -24,9 +24,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
     :type threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
@@ -42,13 +42,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START groundedness_pro_evaluator]
             :end-before: [END groundedness_pro_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. admonition:: Example with threshold:
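The docstring change above records that `azure_ai_project` may now be either a project endpoint string or an AzureAIProject mapping. Below is a hedged sketch of the endpoint-string form using the URL format quoted in the docstring; the endpoint and the query/response/context values are placeholders.

```python
# Sketch only: the endpoint is a placeholder in the documented format, and the
# inputs are illustrative.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator

evaluator = GroundednessProEvaluator(
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    credential=DefaultAzureCredential(),
    threshold=5,  # documented default
)

result = evaluator(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="The Alpine Explorer Tent has a rainfly waterproof rating of 3000mm.",
)
print(result)
```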
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -41,13 +41,13 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.

     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START similarity_evaluator]
             :end-before: [END similarity_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. admonition:: Example:
@@ -85,7 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )

     # Ignoring a mypy error about having only 1 overload function.
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

@@ -13,6 +13,7 @@ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental

+
 @experimental
 class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
@@ -42,15 +43,15 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START task_adherence_evaluator]
             :end-before: [END task_adherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     """
@@ -65,14 +66,11 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
-                 **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path,
-                         result_key=self._RESULT_KEY,
-                         **kwargs)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

     @overload
     def __call__(
@@ -85,7 +83,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """Evaluate task adherence for a given query, response, and optional tool defintions.
         The query and response can be either a string or a list of messages.

-
+
         Example with string inputs and no tools:
         evaluator = TaskAdherenceEvaluator(model_config)
         query = "What is the weather today?"
@@ -113,9 +111,9 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):

     @override
     def __call__(  # pylint: disable=docstring-missing-param
-
-
-
+        self,
+        *args,
+        **kwargs,
     ):
         """
         Invokes the instance using the overloaded __call__ signature.
@@ -149,7 +147,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-            score_result =
+            score_result = "pass" if score >= self.threshold else "fail"

             return {
                 f"{self._result_key}": score,
@@ -159,4 +157,3 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             }

         return {self._result_key: math.nan}
-
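Besides the reflowed signatures, the hunks above make the pass/fail verdict explicit (`score_result = "pass" if score >= self.threshold else "fail"`, with scores parsed in the 1-5 range). A hedged usage sketch follows; the model configuration values and the response text are placeholders, while the query comes from the docstring example shown in the diff.

```python
# Sketch only: model configuration values are placeholders.
from azure.ai.evaluation import TaskAdherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = TaskAdherenceEvaluator(model_config, threshold=3)  # scores fall in the 1-5 range

result = evaluator(
    query="What is the weather today?",
    response="I cannot check live weather, but here is how you can find a local forecast.",
)

# The result holds the parsed score under the evaluator's result key, the reason
# returned by the prompty, and a "pass"/"fail" verdict against the threshold.
print(result)
```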
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)

 T_EvalValue = TypeVar("T_EvalValue")

+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -46,13 +47,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize and call a ToolCallAccuracyEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START tool_call_accuracy_evaluator]
             :end-before: [END tool_call_accuracy_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. note::
@@ -74,15 +75,11 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *,
-                 threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
-                 **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path,
-                         result_key=self._RESULT_KEY,
-                         **kwargs)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

     @overload
     def __call__(
@@ -90,8 +87,8 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         *,
         query: Union[str, List[dict]],
         tool_definitions: Union[dict, List[dict]],
-        tool_calls: Union[dict, List[dict]]
-        response: Union[str, List[dict]] = None
+        tool_calls: Union[dict, List[dict]] = None,
+        response: Union[str, List[dict]] = None,
     ) -> Dict[str, Union[str, float]]:
         """
         Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
@@ -165,8 +162,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         if isinstance(response, list):
             for message in response:
                 if message.get("role") == "assistant":
-                    tool_calls.extend(
-
+                    tool_calls.extend(
+                        [content for content in message.get("content") if content.get("type") == "tool_call"]
+                    )
         if len(tool_calls) == 0:
             raise EvaluationException(
                 message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
@@ -185,7 +183,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
         # we need to provide the output of the previous tool call as part of messages.
         for tool_call in tool_calls:
-            if
+            if (
+                isinstance(tool_call, dict) and tool_call.get("type") == "tool_call"
+            ):  # TODO assuming dict here but it can be a class
                 function_name = tool_call.get("name")
                 tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
                 if len(tool_definition) > 0:
@@ -228,7 +228,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             return {
                 self._result_key: bool(float(score)),
                 f"{self._result_key}_reason": reason,
-                "tool_call_id"
+                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
             }
         raise EvaluationException(
             message="Tool call accuracy evaluator: Invalid score returned from LLM.",
@@ -248,13 +248,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
         if len(eval_input_list) == 0:
-            return {
-
-
-
-
-
-
+            return {
+                self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                f"{self._AGGREGATE_RESULT_KEY}_reason": "No tool calls were made.",
+                "per_tool_call_details": [],
+            }

         per_turn_results = []
         # Evaluate all inputs.
@@ -293,7 +293,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             return {
                 f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
                 f"{self._result_key}_reason": "Tool call not supported for evaluation",
-                "tool_call_id"
+                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
             }

     def _aggregate_results(self, per_turn_results):
@@ -318,23 +318,32 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.

-        num_evaluated = len(
-
+        num_evaluated = len(
+            [
+                per_turn_result
+                for per_turn_result in per_turn_results
+                if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
+            ]
+        )
         if num_evaluated == 0:
             # None of the invoked tools were applicable, return not applicable result
             # (If a tool fails evaluation, we'll throw an exception)
-            return {
-
-
-
-
-
-
+            return {
+                self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                "per_tool_call_details": [],
+            }
         # ignore not_applicable results, where the _result_key will be "not applicable"
-        score =
+        score = (
+            sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
+        )
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f
-
+        aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
+            self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
+        )
+        aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
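In the hunks above, `tool_calls` becomes optional in the overload (tool calls may instead be extracted from assistant messages in `response`), and the aggregate result gains explicit threshold and reason keys plus a "not applicable" path. Below is a hedged sketch of a call with explicit tool_calls; the weather tool, its "arguments" key, and the model configuration are invented for illustration, while the "type", "name", "tool_call_id", and "per_tool_call_details" keys come from the hunks themselves.

```python
# Sketch only: the tool, its "arguments" key, and the model_config values are
# illustrative; the call shape follows the overload shown above.
from azure.ai.evaluation import ToolCallAccuracyEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = ToolCallAccuracyEvaluator(model_config, threshold=0.8)

result = evaluator(
    query="What's the weather in Seattle?",
    tool_definitions=[{"name": "get_weather", "description": "Look up the weather for a city."}],
    tool_calls=[{"type": "tool_call", "tool_call_id": "call_1", "name": "get_weather",
                 "arguments": {"city": "Seattle"}}],
)

# Per _aggregate_results above, the output includes an overall score, a pass/fail
# verdict, the threshold, and "per_tool_call_details" for each evaluated call.
print(result["per_tool_call_details"])
```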
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py

@@ -8,27 +8,28 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

+
 @experimental
 class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     """
-    Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
-    where query represents the user query and response represents the AI system response given the provided context.
-
-    Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
+    Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
+    where query represents the user query and response represents the AI system response given the provided context.
+
+    Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
     emotional state of a person.


     It identifies the following attributes:
-
+
     - emotional_state
     - protected_class
     - groundedness

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any

@@ -42,13 +43,13 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.

     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START ungrounded_attributes_evaluator]
             :end-before: [END ungrounded_attributes_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. note::
@@ -109,5 +110,5 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         :return: The ungrounded attributes label.
        :rtype: Dict[str, Union[str, bool]]
        """
-
+
        return super().__call__(*args, **kwargs)
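As with the other RAI-service evaluators in this diff, the docstring now accepts a project endpoint string for `azure_ai_project`. A hedged sketch based on the documented query/response/context call follows; the endpoint and the inputs are placeholders.

```python
# Sketch only: endpoint and inputs are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import UngroundedAttributesEvaluator

evaluator = UngroundedAttributesEvaluator(
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    credential=DefaultAzureCredential(),
)

result = evaluator(
    query="How does my colleague feel about the new deadline?",
    response="They seem anxious about it.",
    context="The colleague said the timeline is tight.",
)

# Per the docstring, the result labels emotional_state, protected_class, and groundedness.
print(result)
```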
azure/ai/evaluation/_evaluators/_xpia/xpia.py

@@ -40,9 +40,9 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the IndirectAttack evaluator. Default is 0.
     :type threshold: int

@@ -54,15 +54,15 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an IndirectAttackEvaluator.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START indirect_attack_evaluator]
             :end-before: [END indirect_attack_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     """
azure/ai/evaluation/_exceptions.py

@@ -9,6 +9,15 @@ from typing import Optional
 from azure.core.exceptions import AzureError


+class ErrorMessage(Enum):
+    """Error messages to be used when raising EvaluationException.
+
+    These messages are used to provide a consistent error message format across the SDK.
+    """
+
+    MALFORMED_CONVERSATION_HISTORY = "Malformed Conversation History: Query parameter representing conversation history should have exactly one more user query than agent responses"
+
+
 class ErrorCategory(Enum):
     """Error category to be specified when using EvaluationException class.

@@ -87,6 +96,7 @@ class ErrorTarget(Enum):
     TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
     RED_TEAM = "RedTeam"
     AOAI_GRADER = "AoaiGrader"
+    CONVERSATION_HISTORY_PARSING = "_get_conversation_history"


 class EvaluationException(AzureError):
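The new ErrorMessage enum and the CONVERSATION_HISTORY_PARSING target suggest a shared message for conversation-history validation. Below is a hedged sketch of how they might be combined; the helper is hypothetical, and the category/target/blame keyword names and the ErrorCategory/ErrorBlame members are assumed from the rest of _exceptions.py rather than shown in this hunk.

```python
# Hypothetical helper combining the new ErrorMessage and ErrorTarget members.
# The EvaluationException keyword names and the ErrorCategory/ErrorBlame members
# used here are assumptions, not shown in this hunk.
from azure.ai.evaluation._exceptions import (
    ErrorBlame, ErrorCategory, ErrorMessage, ErrorTarget, EvaluationException,
)


def _get_conversation_history(query: list) -> dict:
    user_turns = [m for m in query if m.get("role") == "user"]
    agent_turns = [m for m in query if m.get("role") == "assistant"]
    # The enum's message: history must have exactly one more user query than agent responses.
    if len(user_turns) != len(agent_turns) + 1:
        raise EvaluationException(
            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY.value,
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
        )
    return {"user_queries": user_turns, "agent_responses": agent_turns}
```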
azure/ai/evaluation/_http_utils.py

@@ -7,7 +7,7 @@ from typing import Any, Dict, MutableMapping, Optional, TypedDict, cast

 from typing_extensions import Self, Unpack

-from azure.ai.evaluation._user_agent import
+from azure.ai.evaluation._user_agent import UserAgentSingleton
 from azure.core.configuration import Configuration
 from azure.core.pipeline import AsyncPipeline, Pipeline
 from azure.core.pipeline.policies import (
@@ -454,7 +454,7 @@ def get_http_client(**kwargs: Any) -> HttpPipeline:
     :returns: An HttpPipeline with a set of applied policies:
     :rtype: HttpPipeline
     """
-    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
     return HttpPipeline(**kwargs)


@@ -464,5 +464,5 @@ def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
     :returns: An AsyncHttpPipeline with a set of applied policies:
     :rtype: AsyncHttpPipeline
     """
-    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
     return AsyncHttpPipeline(**kwargs)
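The hunks above swap the previous base user agent value for `UserAgentSingleton().value`. Because the default is applied with `kwargs.setdefault`, a caller-supplied policy still wins. A brief sketch of that behavior, using the internal helpers shown here (both modules are private to the package):

```python
# Sketch of the setdefault behavior above; these are internal (underscore) modules.
from azure.core.pipeline.policies import UserAgentPolicy
from azure.ai.evaluation._http_utils import get_http_client
from azure.ai.evaluation._user_agent import UserAgentSingleton

default_client = get_http_client()  # base user agent comes from UserAgentSingleton().value
print(UserAgentSingleton().value)

custom_client = get_http_client(    # an explicit policy overrides the default
    user_agent_policy=UserAgentPolicy(base_user_agent="my-app/1.0"),
)
```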
azure/ai/evaluation/_legacy/_batch_engine/_engine.py

@@ -319,9 +319,9 @@ class BatchEngine:
                 # to maximize the parallelism, we run the synchronous function in a separate thread
                 # and await its result
                 output = await asyncio.get_event_loop().run_in_executor(
-                    self._executor,
-
-
+                    self._executor, partial(self._func, **inputs)
+                )
+
                 # This should in theory never happen but as an extra precaution, let's check if the output
                 # is awaitable and await it if it is.
                 if inspect.isawaitable(output):
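The hunk above binds the synchronous function and its keyword inputs with `functools.partial` before handing it to `run_in_executor`, which only forwards positional arguments. A standalone sketch of that pattern with stand-in names follows (the batch engine's own types are not reproduced here).

```python
# Standalone illustration of the pattern above; _score_row and its inputs are stand-ins.
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def _score_row(*, query: str, response: str) -> int:
    # Stand-in for a synchronous evaluator call.
    return len(response) - len(query)


async def main() -> None:
    inputs = {"query": "hi", "response": "hello there"}
    with ThreadPoolExecutor() as executor:
        # run_in_executor accepts only positional args, so keyword inputs are bound via partial.
        output = await asyncio.get_event_loop().run_in_executor(
            executor, partial(_score_row, **inputs)
        )
    print(output)


asyncio.run(main())
```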
azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py

@@ -90,7 +90,9 @@ def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
         except ImportError:
             raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
         except AttributeError:
-            logging.warning(
+            logging.warning(
+                "The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name
+            )


 def inject_openai_api():
@@ -117,6 +119,7 @@ def recover_openai_api():

 class CaptureOpenAITokenUsage:
     """Context manager to capture OpenAI token usage."""
+
     def __init__(self):
         self._tokens = TokenMetrics(0, 0, 0)

@@ -126,4 +129,4 @@ class CaptureOpenAITokenUsage:

     def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
         captured_metrics = _token_metrics.get()
-        self._tokens.update(captured_metrics)
+        self._tokens.update(captured_metrics)