azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluator_definition.py (new file)
@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance
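
The new `_evaluator_definition` module is pure serialization metadata. A minimal round-trip sketch, assuming the private import path matches the file location above; the `CoherenceDefinition` subclass and its metric values are invented for illustration:

    # Sketch only: the concrete subclass, metric values, and import path are assumptions.
    from azure.ai.evaluation._evaluator_definition import (
        EvaluatorDefinition,
        EvaluatorMetric,
        ObjectParameterDescriptorWithRequired,
    )

    class CoherenceDefinition(EvaluatorDefinition):
        """Hypothetical concrete definition used only for this sketch."""

        def __init__(self):
            super().__init__()
            self.type = "coherence"
            self.metrics = {
                "coherence": EvaluatorMetric(
                    type="ordinal", desirable_direction="increase", min_value=1.0, max_value=5.0
                )
            }
            self.data_schema = ObjectParameterDescriptorWithRequired(
                required=["query", "response"],
                properties={"query": {"type": "string"}, "response": {"type": "string"}},
            )

    definition = CoherenceDefinition()
    payload = definition.to_dict()               # plain, JSON-serializable dict
    restored = CoherenceDefinition.from_dict(payload)
    assert restored.to_dict() == payload         # round trip is lossless for these fields
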
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_bleu_score_evaluator]
            :end-before: [END threshold_bleu_score_evaluator]
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any

-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
    .. note::

        If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -23,6 +23,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the coherence evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -66,7 +71,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, threshold=3, credential=None):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -78,6 +83,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )

    @overload
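
With `**kwargs` now forwarded to the base class, keyword flags documented above (notably `is_reasoning_model`) can be passed straight through the constructor. A hedged usage sketch; the endpoint, deployment, and key values are placeholders:

    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<reasoning-model-deployment>",
        "api_key": "<api-key>",
    }

    coherence = CoherenceEvaluator(
        model_config=model_config,
        threshold=3,
        is_reasoning_model=True,  # adjusts generation parameters for o1/o3-style deployments
    )

    result = coherence(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )
    print(result["coherence"], result["coherence_result"])

The same pass-through applies to FluencyEvaluator and GroundednessEvaluator further down.
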
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,12 +4,15 @@

 import inspect
 from abc import ABC, abstractmethod
+import json
+import copy
 from typing import (
     Any,
     Callable,
     Dict,
     Generic,
     List,
+    Tuple,
     TypedDict,
     TypeVar,
     Union,
@@ -111,6 +114,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     _NOT_APPLICABLE_RESULT = "not applicable"
     _PASS_RESULT = "pass"
     _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -498,7 +502,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                 for content_item in message.get("content"):
                     if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                        tool_calls.append(content_item)
+                        tool_calls.append(copy.deepcopy(content_item))

             # Extract tool results from tool messages
             elif message.get("role") == "tool" and message.get("tool_call_id"):
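
The switch to `copy.deepcopy` matters because the parsed tool calls were previously aliases into the caller's message content. A standalone illustration (the tool name and arguments are invented for the example):

    import copy

    message_content = [{"type": "tool_call", "name": "fetch_weather", "arguments": {"city": "Paris"}}]

    # Without deepcopy, the extracted item aliases the original dict...
    aliased = [item for item in message_content if item.get("type") == "tool_call"]
    aliased[0]["arguments"]["city"] = "London"
    print(message_content[0]["arguments"]["city"])   # "London" -- the conversation was mutated

    # ...with deepcopy, the original conversation is left untouched.
    message_content[0]["arguments"]["city"] = "Paris"
    copied = [copy.deepcopy(item) for item in message_content if item.get("type") == "tool_call"]
    copied[0]["arguments"]["city"] = "London"
    print(message_content[0]["arguments"]["city"])   # "Paris"
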
@@ -516,6 +520,67 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return tool_calls

+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.

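
A shape sketch of what the new helper consumes and returns, assuming the converter-style messages that `_parse_tools_from_response` already walks (assistant content items with type "tool_call"); the tool names and arguments are illustrative:

    response = [
        {
            "role": "assistant",
            "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
                 "arguments": {"city": "Paris", "days": 3}},
                {"type": "tool_call", "tool_call_id": "call_2", "name": "send_email",
                 "arguments": '{"to": "user@example.com"}'},  # JSON-string arguments are parsed
            ],
        }
    ]

    # Expected output of evaluator._extract_tool_names_and_params_from_response(response),
    # where "evaluator" is any EvaluatorBase subclass instance:
    expected = [
        ("fetch_weather", {"city": "Paris", "days": "3"}),   # argument values coerced to str
        ("send_email", {"to": "user@example.com"}),
    ]
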
@@ -542,14 +607,25 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 base_key = key[:-6]  # Remove "_score" suffix
                 result_key = f"{base_key}_result"
                 threshold_key = f"{base_key}_threshold"
-
+                threshold_value = (
+                    self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                )
+                if not isinstance(threshold_value, (int, float)):
+                    raise EvaluationException(
+                        "Threshold value must be a number.",
+                        internal_message=str(threshold_value),
+                        target=ErrorTarget.EVALUATE,
+                        category=ErrorCategory.INVALID_VALUE,
+                    )
+
+                result[threshold_key] = threshold_value
                 if self._higher_is_better:
-                    if float(score_value) >=
+                    if float(score_value) >= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                 else:
-                    if float(score_value) <=
+                    if float(score_value) <= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
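
The threshold can now be either a single number or a per-metric dict keyed by the base metric name (the score key minus its "_score" suffix). A standalone sketch of the resolution rule, using ValueError in place of the SDK's EvaluationException; metric names are illustrative:

    from typing import Dict, Union

    def resolve_threshold(threshold: Union[int, float, Dict[str, float]], base_key: str) -> float:
        value = threshold.get(base_key) if isinstance(threshold, dict) else threshold
        if not isinstance(value, (int, float)):
            raise ValueError(f"Threshold for '{base_key}' must be a number, got {value!r}")
        return value

    print(resolve_threshold(3, "relevance"))                             # 3
    print(resolve_threshold({"relevance": 4, "fluency": 2}, "fluency"))  # 2
    try:
        resolve_threshold({"relevance": 4}, "fluency")                   # missing key resolves to None
    except ValueError as exc:
        print(exc)
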
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -5,7 +5,8 @@
 import math
 import re
 import os
-from
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List

 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -132,10 +133,19 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                     category=ErrorCategory.INVALID_VALUE,
                     target=ErrorTarget.CONVERSATION,
                 )
-
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan
-        if
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
@@ -146,6 +156,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                     f"{self._result_key}_reason": reason,
                     f"{self._result_key}_result": binary_result,
                     f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                 }

             match = re.search(r"\d", llm_output)
@@ -156,6 +173,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                    f"gpt_{self._result_key}": float(score),
                    f"{self._result_key}_result": binary_result,
                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                }

        binary_result = self._get_binary_result(score)
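
Prompt-based evaluators now surface model telemetry from the prompty call alongside the score, prefixed with the result key. A shape sketch with illustrative values, using "relevance" as the result key:

    example_relevance_result = {
        "relevance": 4.0,
        "gpt_relevance": 4.0,
        "relevance_reason": "The response addresses the question directly.",
        "relevance_result": "pass",
        "relevance_threshold": 3,
        "relevance_prompt_tokens": 412,
        "relevance_completion_tokens": 38,
        "relevance_total_tokens": 450,
        "relevance_finish_reason": "stop",
        "relevance_model": "gpt-4o-2024-08-06",
        "relevance_sample_input": "...",   # serialized request sample from the prompty layer
        "relevance_sample_output": "...",  # serialized response sample
    }
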
@@ -165,3 +189,157 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
            f"{self._result_key}_result": binary_result,
            f"{self._result_key}_threshold": self._threshold,
        }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
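
A standalone sketch of the matching rule enforced by `_extract_needed_tool_definitions`: a non-built-in call must have a same-named definition of type "function" after any "openapi" definition is expanded into its member functions (built-in tools are resolved from the converter's catalog instead). Tool names are illustrative, and ValueError stands in for EvaluationException:

    from itertools import chain

    tool_calls = [{"type": "tool_call", "name": "fetch_weather", "arguments": {"city": "Paris"}}]

    tool_definitions = [
        {"type": "openapi", "functions": [
            {"type": "function", "name": "fetch_weather", "parameters": {"type": "object", "properties": {}}},
            {"type": "function", "name": "fetch_forecast", "parameters": {"type": "object", "properties": {}}},
        ]},
    ]

    # Flatten openapi definitions into their member functions.
    expanded = list(chain.from_iterable(
        tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
        for tool in tool_definitions
    ))

    for call in tool_calls:
        name = call["name"]
        if not any(t.get("name") == name and t.get("type", "function") == "function" for t in expanded):
            raise ValueError(f"Tool definition for {name} not found")
    print("all tool calls have matching definitions")
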
azure/ai/evaluation/_evaluators/_fluency/_fluency.py
@@ -25,6 +25,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the fluency evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -68,7 +73,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, credential=None, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -80,6 +85,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )

    @overload
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -5,7 +5,7 @@ import os, logging
 from typing import Dict, List, Optional, Union, Any, Tuple

 from typing_extensions import overload, override
-from azure.ai.evaluation._legacy.
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
@@ -33,8 +33,7 @@ logger = logging.getLogger(__name__)


 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
     including reasoning.

     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -50,6 +49,11 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the groundedness evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -61,6 +65,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :caption: Initialize and call a GroundednessEvaluator.

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_groundedness_evaluator]
            :end-before: [END threshold_groundedness_evaluator]
@@ -107,6 +112,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )
        self._model_config = model_config
        self.threshold = threshold
@@ -196,18 +202,24 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        """

        if kwargs.get("query", None):
-
-            prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
-            self._prompty_file = prompty_path
-            prompty_model_config = construct_prompty_model_config(
-                validate_model_config(self._model_config),
-                self._DEFAULT_OPEN_API_VERSION,
-                UserAgentSingleton().value,
-            )
-            self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+            self._ensure_query_prompty_loaded()

        return super().__call__(*args, **kwargs)

+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
    def _has_context(self, eval_input: dict) -> bool:
        """
        Return True if eval_input contains a non-empty 'context' field.
@@ -226,7 +238,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
-        if "query"
+        if eval_input.get("query", None) is None:
            return await super()._do_eval(eval_input)

        contains_context = self._has_context(eval_input)
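
Usage sketch of the query-aware path: when a query is supplied, the evaluator now swaps in the query prompty via the new `_ensure_query_prompty_loaded` helper before scoring; without a query the default prompty is used. Endpoint, deployment, and key values below are placeholders:

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<deployment-name>",
        "api_key": "<api-key>",
    }

    groundedness = GroundednessEvaluator(model_config=model_config, threshold=3)

    # Without a query: default groundedness prompty.
    result = groundedness(
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )

    # With a query: the query-aware prompty is loaded before scoring.
    result_with_query = groundedness(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )
    print(result_with_query["groundedness"], result_with_query["groundedness_result"])
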
@@ -273,6 +285,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            response = kwargs.get("response")
            tool_definitions = kwargs.get("tool_definitions")

+            if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+                self._ensure_query_prompty_loaded()
+
            if (not query) or (not response):  # or not tool_definitions:
                msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
                raise EvaluationException(