azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release. This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_gleu/_gleu.py

```diff
@@ -1,58 +1,99 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
 
-
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }
-
-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
-
+    Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
     The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
     evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
     sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
     use cases such as machine translation, text summarization, and text generation.
 
-
+    GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
+    the ground truth and a value of 0 indicates no overlap.
 
-
+    :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+    :type threshold: float
 
-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+        :start-after: [START gleu_score_evaluator]
+        :end-before: [END gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call a GleuScoreEvaluator.
 
-    ..
+    .. admonition:: Example with Threshold:
 
-
-
-
+    .. literalinclude:: ../samples/evaluation_samples_threshold.py
+        :start-after: [START threshold_gleu_score_evaluator]
+        :end-before: [END threshold_gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+    .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+        :start-after: [START gleu_score_evaluator]
+        :end-before: [END gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
-
-
+    id = "azureai://built-in/evaluators/gleu_score"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
+    @override
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "gleu_score": score,
+            "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "gleu_threshold": self._threshold,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.
 
@@ -61,11 +102,23 @@ class GleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The GLEU score.
-        :rtype:
+        :rtype: Dict[str, float]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
-
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
+        Evaluate the GLEU score between the response and the ground truth.
 
-
-
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
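
The rewritten GLEU evaluator drops the internal `_AsyncGleuScoreEvaluator` wrapper, inherits from the shared `EvaluatorBase`, and adds a configurable pass/fail threshold to the output. A minimal usage sketch follows, inferred from the added code above rather than from separate documentation; the pass/fail strings assume `EVALUATION_PASS_FAIL_MAPPING` maps `True` to `"pass"` and `False` to `"fail"`:

```python
# Sketch only: exercises the GleuScoreEvaluator surface shown in this diff.
# Assumes azure-ai-evaluation >= 1.13 with its nltk dependency installed.
from azure.ai.evaluation import GleuScoreEvaluator

gleu = GleuScoreEvaluator(threshold=0.5)  # threshold keyword is new in this version

result = gleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)

print(result["gleu_score"])      # float in [0, 1]
print(result["gleu_result"])     # pass/fail label; score >= threshold passes
print(result["gleu_threshold"])  # echoes the configured threshold (0.5)
```

Where 1.0.0b2 returned only `{"gleu_score": ...}`, the call now routes through `EvaluatorBase.__call__` and emits `gleu_result` and `gleu_threshold` alongside the raw score.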
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

```diff
@@ -1,118 +1,354 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import os, logging
+from typing import Dict, List, Optional, Union, Any, Tuple
 
-import
-import
+from typing_extensions import overload, override
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 
-
-from
-from
-
-
-
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+    simplify_messages,
+)
 
 try:
-    from ..._user_agent import
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncGroundednessEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "groundedness.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-
-
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
 
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
 
-
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+logger = logging.getLogger(__name__)
 
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
 
-
-
-
+class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    including reasoning.
 
-
-
-
-
+    The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
+    context, making sure that these claims are substantiated by the context. Even if the responses from LLM are
+    factually correct, they'll be considered ungrounded if they can't be verified against the provided sources
+    (such as your input source or your database). Use the groundedness metric when you need to verify that
+    AI-generated responses align with and are validated by the provided context.
 
-
-        msg = "Both 'response' and 'context' must be non-empty strings."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            error_category=ErrorCategory.MISSING_FIELD,
-            error_blame=ErrorBlame.USER_ERROR,
-            error_target=ErrorTarget.F1_EVALUATOR,
-        )
+    Groundedness scores range from 1 to 5, with 1 being the least grounded and 5 being the most grounded.
 
-
-
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the groundedness evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
-
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
+    .. admonition:: Example:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+        :start-after: [START groundedness_evaluator]
+        :end-before: [END groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call a GroundednessEvaluator.
 
+    .. admonition:: Example with Threshold:
 
-
-
-
+    .. literalinclude:: ../samples/evaluation_samples_threshold.py
+        :start-after: [START threshold_groundedness_evaluator]
+        :end-before: [END threshold_groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize with threshold and call a GroundednessEvaluator.
 
-
-    :type model_config: Union[~azure.ai.evalation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evalation.OpenAIModelConfiguration]
+    .. admonition:: Example using Azure AI Project URL:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+        :start-after: [START groundedness_evaluator]
+        :end-before: [END groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
-    ..
+    .. note::
 
-
-
-
-
-        and technological advancements.")
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
 
-
+    _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty"
+    _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
+    _RESULT_KEY = "groundedness"
+    _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
-
+    id = "azureai://built-in/evaluators/groundedness"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
-
-
-
+    @override
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
-
-
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
+        self._model_config = model_config
+        self.threshold = threshold
+        # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
 
-
-
-
+    @overload
+    def __call__(
+        self,
+        *,
+        response: str,
+        context: str,
+        query: Optional[str] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of response, context
 
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :keyword context: The context
+        :keyword context: The context to be evaluated.
         :paramtype context: str
+        :keyword query: The query to be evaluated. Optional parameter for use with the `response`
+            and `context` parameters. If provided, a different prompt template will be used for evaluation.
+        :paramtype query: Optional[str]
+        :return: The groundedness score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate groundedness for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The groundedness score.
-        :rtype:
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword query: The query to be evaluated. Mutually exclusive with `conversation`. Optional parameter for use
+            with the `response` and `context` parameters. If provided, a different prompt template will be used for
+            evaluation.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
+        """
+
+        if kwargs.get("query", None):
+            self._ensure_query_prompty_loaded()
+
+        return super().__call__(*args, **kwargs)
+
+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
+    def _has_context(self, eval_input: dict) -> bool:
+        """
+        Return True if eval_input contains a non-empty 'context' field.
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        """
+        context = eval_input.get("context", None)
+        if not context:
+            return False
+        if context == "<>":  # Special marker for no context
+            return False
+        if isinstance(context, list):
+            return any(str(c).strip() for c in context)
+        if isinstance(context, str):
+            return bool(context.strip())
+        return True
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        if eval_input.get("query", None) is None:
+            return await super()._do_eval(eval_input)
+
+        contains_context = self._has_context(eval_input)
+
+        simplified_query = simplify_messages(eval_input["query"], drop_tool_calls=contains_context)
+        simplified_response = simplify_messages(eval_input["response"], drop_tool_calls=False)
+
+        # Build simplified input
+        simplified_eval_input = {
+            "query": simplified_query,
+            "response": simplified_response,
+            "context": eval_input["context"],
+        }
+
+        # Replace and call the parent method
+        return await super()._do_eval(simplified_eval_input)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
-
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if kwargs.get("context") or kwargs.get("conversation"):
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+            self._ensure_query_prompty_loaded()
+
+        if (not query) or (not response):  # or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+        context = self._get_context_from_agent_response(response, tool_definitions)
+
+        filtered_response = self._filter_file_search_results(response)
+        return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
+
+    def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out file_search tool results from the messages."""
+        file_search_ids = self._get_file_search_tool_call_ids(messages)
+        return [
+            msg for msg in messages if not (msg.get("role") == "tool" and msg.get("tool_call_id") in file_search_ids)
+        ]
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        """Extract context text from file_search tool results in the agent response."""
+        NO_CONTEXT = "<>"
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
+
+            if not tool_calls:
+                return NO_CONTEXT
+
+            context_lines = []
+            for tool_call in tool_calls:
+                if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
+                    continue
+
+                tool_name = tool_call.get("name")
+                if tool_name != "file_search":
+                    continue
+
+                # Extract tool results
+                for result in tool_call.get("tool_result", []):
+                    results = result if isinstance(result, list) else [result]
+                    for r in results:
+                        file_name = r.get("file_name", "Unknown file name")
+                        for content in r.get("content", []):
+                            text = content.get("text")
+                            if text:
+                                context_lines.append(f"{file_name}:\n- {text}---\n\n")
+
+            context = "\n".join(context_lines) if len(context_lines) > 0 else None
+
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = None
+
+        context = context if context else NO_CONTEXT
+        return context
 
-    def
-
+    def _get_file_search_tool_call_ids(self, query_or_response):
+        """Return a list of tool_call_ids for file search tool calls."""
+        tool_calls = self._parse_tools_from_response(query_or_response)
+        return [tc.get("tool_call_id") for tc in tool_calls if tc.get("name") == "file_search"]
```
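
The groundedness rewrite is larger: it folds three calling patterns into one evaluator via `@overload`, swaps prompty templates when a `query` is supplied, and can extract its own `context` from `file_search` tool results in agent responses. A sketch of the three patterns, assuming the dict-shaped `AzureOpenAIModelConfiguration` from this package; the endpoint, deployment, and key values are placeholders, and the agent-mode call is shown only in outline:

```python
# Sketch only: the three GroundednessEvaluator call patterns from the
# overloads above. Requires a real Azure OpenAI deployment to actually run.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}

groundedness = GroundednessEvaluator(model_config, threshold=3)

# 1) Single evaluation: response + context, with an optional query. Passing
#    `query` makes the evaluator load groundedness_with_query.prompty
#    (see _ensure_query_prompty_loaded above).
result = groundedness(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital and largest city.",
)
print(result["groundedness"])  # 1-5 score; scores >= threshold count as grounded

# 2) Multi-turn conversation: turns live under "messages"; per-turn results
#    are aggregated by the base class.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is Japan's capital."},
    ]
}
print(groundedness(conversation=conversation))

# 3) Agent mode (outline only): query + agent messages + tool definitions.
#    Only file_search is supported; context is pulled from file_search results.
# groundedness(query=agent_query, response=agent_messages, tool_definitions=tools)
```

Note the behavior surfaced in `_real_call`: if an agent response never invoked a supported tool, the evaluator returns a "not applicable" result marked `pass` rather than raising, so batch runs are not aborted by tool-free turns.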