azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation has been flagged as potentially problematic in the registry's diff report.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
File: azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -23,6 +23,13 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the fluency evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -33,6 +40,25 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a FluencyEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_fluency_evaluator]
+            :end-before: [END threshold_fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a FluencyEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -43,14 +69,24 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "fluency.prompty"
     _RESULT_KEY = "fluency"
 
-    id = "
+    id = "azureai://built-in/evaluators/fluency"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
 
     @overload
     def __call__(
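The hunks above only show the constructor and docstring changes for FluencyEvaluator. For orientation, here is a minimal, hypothetical usage sketch of the new keyword-only `threshold` and `credential` parameters; the endpoint, key, and deployment values are placeholders, and the exact output keys depend on the installed version:

```python
# Hypothetical sketch of the new FluencyEvaluator signature shown in the diff above.
# Endpoint/key/deployment are placeholders, not values taken from the package.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, FluencyEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

# threshold and credential are new keyword-only arguments (defaults: threshold=3, credential=None).
fluency = FluencyEvaluator(model_config, threshold=4)

# The calling convention is unchanged; with a threshold configured, the result is
# expected to carry a pass/fail entry alongside the numeric fluency score.
result = fluency(response="The capital of Japan is Tokyo, which is also its largest city.")
print(result)
```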
File: azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -1,28 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
 
-
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }
-
-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
     Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
@@ -34,6 +23,9 @@ class GleuScoreEvaluator:
     GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
     the ground truth and a value of 0 indicates no overlap.
 
+    :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+    :type threshold: float
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -42,15 +34,66 @@ class GleuScoreEvaluator:
             :language: python
             :dedent: 8
             :caption: Initialize and call a GleuScoreEvaluator.
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_gleu_score_evaluator]
+            :end-before: [END threshold_gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/gleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
-
+    @override
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
-
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "gleu_score": score,
+            "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "gleu_threshold": self._threshold,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.
 
@@ -61,9 +104,21 @@ class GleuScoreEvaluator:
         :return: The GLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the GLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
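Since the GLEU evaluator was rewritten on top of EvaluatorBase, here is a short sketch of how the threshold-aware API reads. The argument names and output keys come from the `__init__` and `_do_eval` bodies above; the sample strings are illustrative:

```python
# Sketch of the threshold-aware GleuScoreEvaluator, based on the diff above.
from azure.ai.evaluation import GleuScoreEvaluator

gleu = GleuScoreEvaluator(threshold=0.4)  # keyword-only; default is 0.5

result = gleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)

# Per _do_eval above, the returned dict is expected to contain:
#   "gleu_score"     - raw sentence_gleu value in [0, 1]
#   "gleu_result"    - pass/fail label derived from EVALUATION_PASS_FAIL_MAPPING
#   "gleu_threshold" - the threshold used for the comparison
print(result)
```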
File: azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -1,25 +1,39 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
-from typing import Dict, List, Optional, Union
+import os, logging
+from typing import Dict, List, Optional, Union, Any, Tuple
 
 from typing_extensions import overload, override
-from
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+    simplify_messages,
+)
 
 try:
-    from ..._user_agent import
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
+
+
+logger = logging.getLogger(__name__)
 
 
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
     including reasoning.
 
     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -33,6 +47,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the groundedness evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -43,6 +64,25 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_groundedness_evaluator]
+            :end-before: [END threshold_groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a GroundednessEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -54,17 +94,28 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
-    id = "
+    id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
-
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
         self._model_config = model_config
+        self.threshold = threshold
         # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
 
     @overload
@@ -88,6 +139,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """
 
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
@@ -131,14 +202,153 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
 
         if kwargs.get("query", None):
-
-            prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
-            self._prompty_file = prompty_path
-            prompty_model_config = construct_prompty_model_config(
-                validate_model_config(self._model_config),
-                self._DEFAULT_OPEN_API_VERSION,
-                USER_AGENT,
-            )
-            self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+            self._ensure_query_prompty_loaded()
 
         return super().__call__(*args, **kwargs)
+
+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
+    def _has_context(self, eval_input: dict) -> bool:
+        """
+        Return True if eval_input contains a non-empty 'context' field.
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        """
+        context = eval_input.get("context", None)
+        if not context:
+            return False
+        if context == "<>":  # Special marker for no context
+            return False
+        if isinstance(context, list):
+            return any(str(c).strip() for c in context)
+        if isinstance(context, str):
+            return bool(context.strip())
+        return True
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        if eval_input.get("query", None) is None:
+            return await super()._do_eval(eval_input)
+
+        contains_context = self._has_context(eval_input)
+
+        simplified_query = simplify_messages(eval_input["query"], drop_tool_calls=contains_context)
+        simplified_response = simplify_messages(eval_input["response"], drop_tool_calls=False)
+
+        # Build simplified input
+        simplified_eval_input = {
+            "query": simplified_query,
+            "response": simplified_response,
+            "context": eval_input["context"],
+        }
+
+        # Replace and call the parent method
+        return await super()._do_eval(simplified_eval_input)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if kwargs.get("context") or kwargs.get("conversation"):
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+            self._ensure_query_prompty_loaded()
+
+        if (not query) or (not response):  # or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+        context = self._get_context_from_agent_response(response, tool_definitions)
+
+        filtered_response = self._filter_file_search_results(response)
+        return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
+
+    def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out file_search tool results from the messages."""
+        file_search_ids = self._get_file_search_tool_call_ids(messages)
+        return [
+            msg for msg in messages if not (msg.get("role") == "tool" and msg.get("tool_call_id") in file_search_ids)
+        ]
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        """Extract context text from file_search tool results in the agent response."""
+        NO_CONTEXT = "<>"
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
+
+            if not tool_calls:
+                return NO_CONTEXT
+
+            context_lines = []
+            for tool_call in tool_calls:
+                if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
+                    continue
+
+                tool_name = tool_call.get("name")
+                if tool_name != "file_search":
+                    continue
+
+                # Extract tool results
+                for result in tool_call.get("tool_result", []):
+                    results = result if isinstance(result, list) else [result]
+                    for r in results:
+                        file_name = r.get("file_name", "Unknown file name")
+                        for content in r.get("content", []):
+                            text = content.get("text")
+                            if text:
+                                context_lines.append(f"{file_name}:\n- {text}---\n\n")
+
+            context = "\n".join(context_lines) if len(context_lines) > 0 else None
+
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = None
+
+        context = context if context else NO_CONTEXT
+        return context
+
+    def _get_file_search_tool_call_ids(self, query_or_response):
+        """Return a list of tool_call_ids for file search tool calls."""
+        tool_calls = self._parse_tools_from_response(query_or_response)
+        return [tc.get("tool_call_id") for tc in tool_calls if tc.get("name") == "file_search"]
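The groundedness changes are the largest in this diff: a new threshold/credential constructor, a new agent overload (query: str, response: List[dict], tool_definitions: List[dict], file_search only), and context extraction from file_search tool results. Below is a minimal, hedged sketch of the classic query/response/context usage with the new threshold; endpoint, key, deployment, and sample strings are placeholders:

```python
# Hypothetical sketch based on the GroundednessEvaluator changes above; placeholder credentials.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

# threshold (default 3) and credential are new keyword-only arguments.
groundedness = GroundednessEvaluator(model_config, threshold=3)

# Classic usage is unchanged: query is optional, and context is the source of truth.
result = groundedness(
    query="By what date must participants register to receive early bird pricing?",
    response="Participants must register by May 31st to receive early bird pricing.",
    context="Participants registering before and including May 31st are eligible for early bird pricing.",
)
print(result)

# The diff also adds an agent-style overload that takes the raw message list and tool
# definitions instead of a context string; only file_search results are used for context:
#   groundedness(query=..., response=[...message dicts...], tool_definitions=[...])
```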
File: azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty

@@ -32,52 +32,53 @@ system:
 
 user:
 # Definition
-**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information.
+**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information.
+
+> Context is the source of truth for evaluating the response. If it's empty, rely on the tool results in the response and query.
+> Evaluate the groundedness of the response message, not the chat history.
 
 # Ratings
 ## [Groundedness: 1] (Completely Unrelated Response)
-**Definition:** An answer that does not relate to the question or the context in any way.
+**Definition:** An answer that does not relate to the question or the context in any way.
+- Does not relate to the question or context at all.
+- Talks about the general topic but does not respond to the query.
 
 **Examples:**
 **Context:** The company's annual meeting will be held next Thursday.
 **Query:** When is the company's annual meeting?
 **Response:** I enjoy hiking in the mountains during summer.
 
-**Context:** The new policy aims to reduce carbon emissions by 20% over the next five years.
-**Query:** What is the goal of the new policy?
-**Response:** My favorite color is blue.
-
-## [Groundedness: 2] (Related Topic but Does Not Respond to the Query)
-**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response.
-
-**Examples:**
 **Context:** The museum will exhibit modern art pieces from various local artists.
 **Query:** What kind of art will be exhibited at the museum?
 **Response:** Museums are important cultural institutions.
 
-
-
-**Response:** Software updates can sometimes fix bugs.
-
-## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information)
-**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details.
+## [Groundedness: 2] (Attempts to Respond but Contains Incorrect Information)
+**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. Even if some points are correct, the presence of inaccuracies makes the response unreliable.
 
 **Examples:**
-**Context:** The festival starts on June 5th and features international musicians.
+**Context:** - The festival starts on June 5th and features international musicians.
 **Query:** When does the festival start?
 **Response:** The festival starts on July 5th and features local artists.
 
-**Context:**
-**Query:**
-**Response:**
+**Context:** bakery_menu.txt: - Croissant au Beurre — flaky, buttery croissant
+**Query:** [{"role":"user","content":"Are there croissants?"}]
+**Response:** [{"role":"assistant","content":"Yes, Croissant au Beurre is on the menu, served with jam."}]
+
+## [Groundedness: 3] (Nothing to be Grounded)
+Definition: An answer that does not provide any information that can be evaluated against the context. This includes responses that are asking for clarification, providing polite fillers, or follow-up questions.
+
+**Examples:**
+**Context:**
+**Query:** [{"role":"user","content":"How many eggs are needed for the recipe?"}, {"role":"tool","content":"tool_result": [{"file_name": "recipe.txt", "content": "The recipe requires two eggs and one cup of milk."}]}, {"role":"assistant","content":"You need three eggs for the recipe."}, {"role":"user","content":"Thank you."}]
+**Response:** [{"role":"assistant","content":"You're welcome, anything else I can help with?"}]
 
 ## [Groundedness: 4] (Partially Correct Response)
 **Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding.
 
 **Examples:**
-**Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens.
-**Query:** What discount does the bookstore offer to students?
-**Response:**
+**Context:** - store_details.txt: The bookstore offers a 15% discount to students and a 10% discount to senior citizens.
+**Query:** [{"role":"user","content":"What discount does the bookstore offer to students, if any?"}]
+**Response:** [{"role":"assistant","content":"Yes, students get a discount at the bookstore."}]
 
 **Context:** The company's headquarters are located in Berlin, Germany.
 **Query:** Where are the company's headquarters?
@@ -87,13 +88,13 @@ user:
 **Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information.
 
 **Examples:**
-**
-**
-**
+**CONTEXT:** The author released her latest novel, 'The Silent Echo', on September 1st.
+**QUERY:** [{"role":"user","content":"When was 'The Silent Echo' released?"}]
+**RESPONSE:** [{"role":"assistant","content":"The 'Silent Echo' was released on September 1st."}]
 
-**Context:**
+**Context:**
 **Query:** By what date must participants register to receive early bird pricing?
-**Response:** Participants must register by May 31st to receive early bird pricing.
+**Response:** [{"role":"tool","content":"tool_result": [{"file_name": "store_guidelines.txt", "content": "Participants registering before and including May 31st will be eligible for early bird pricing."}]}, {"role":"assistant","content":"Participants must register by May 31st to receive early bird pricing."}]
 
 
 # Data
@@ -103,7 +104,7 @@ RESPONSE: {{response}}
 
 
 # Tasks
-## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and
+## Please provide your assessment Score for the previous RESPONSE message in relation to the CONTEXT, QUERY and RESPONSE tools based on the Definitions above. Your output should include the following information:
 - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
 - **Explanation**: a very short explanation of why you think the input Data should get that Score.
 - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.