azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import sys
|
|
9
|
+
import itertools
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
from concurrent.futures import Future
|
|
12
|
+
from os import PathLike
|
|
13
|
+
from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
|
|
14
|
+
|
|
15
|
+
from .batch_clients import BatchClientRun, HasAsyncCallable
|
|
16
|
+
from ..._legacy._batch_engine._run_submitter import RunSubmitter
|
|
17
|
+
from ..._legacy._batch_engine._config import BatchEngineConfig
|
|
18
|
+
from ..._legacy._batch_engine._run import Run
|
|
19
|
+
from ..._legacy._adapters._constants import LINE_NUMBER
|
|
20
|
+
from ..._legacy._adapters.types import AttrDict
|
|
21
|
+
from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
|
|
22
|
+
from ..._evaluate._utils import _has_aggregator
|
|
23
|
+
from ..._constants import Prefixes, PF_BATCH_TIMEOUT_SEC
|
|
24
|
+
|
|
25
|
+
from .._utils import get_int_env_var as get_int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
LOGGER = logging.getLogger("run")
|
|
29
|
+
MISSING_VALUE: Final[int] = sys.maxsize
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RunSubmitterClient:
    """Batch client that executes evaluators through ``RunSubmitter``.

    :meth:`run` schedules the submission on an internal thread pool and returns the
    resulting future as the run handle. The other public methods resolve that handle
    (blocking until the future completes) and read data off the finished ``Run``.
    """

    def __init__(self, *, raise_on_errors: bool = False, config: Optional[BatchEngineConfig] = None) -> None:
        """Create the client.

        :keyword raise_on_errors: Value stored in the configuration's ``raise_on_error`` field.
        :paramtype raise_on_errors: bool
        :keyword config: Configuration to use. When omitted, a default ``BatchEngineConfig``
            is created and selected fields are overridden from environment variables.
        :paramtype config: Optional[BatchEngineConfig]
        """
        if config:
            self._config = config
        else:
            # Generate default config and apply any overrides to the configuration from environment variables.
            # MISSING_VALUE is passed as the fallback so that "no override" can be told apart from a real value.
            self._config = BatchEngineConfig(LOGGER, use_async=True)
            if (val := get_int(PF_BATCH_TIMEOUT_SEC, MISSING_VALUE)) != MISSING_VALUE:
                self._config.batch_timeout_seconds = val
            if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
                self._config.line_timeout_seconds = val
            if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
                self._config.max_concurrency = val

        # Applied to whichever configuration is in use, including one supplied by the caller.
        self._config.raise_on_error = raise_on_errors

        self._thread_pool = ThreadPoolExecutorWithContext(
            thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
        )

    def run(
        self,
        flow: Callable,
        data: Union[str, PathLike, pd.DataFrame],
        column_mapping: Optional[Dict[str, str]] = None,
        evaluator_name: Optional[str] = None,
        **kwargs: Any,
    ) -> BatchClientRun:
        """Submit ``flow`` to be run over every row of ``data``.

        :param flow: The callable to run. If it satisfies ``HasAsyncCallable``, the callable
            returned by its ``_to_async()`` is used instead.
        :type flow: Callable
        :param data: The input rows. Only a pandas DataFrame is accepted by this client.
        :type data: Union[str, PathLike, pandas.DataFrame]
        :param column_mapping: The column mapping forwarded to the run submitter.
        :type column_mapping: Optional[Dict[str, str]]
        :param evaluator_name: Forwarded to the run submitter as ``name_prefix``.
        :type evaluator_name: Optional[str]
        :keyword run: Optional handle of a previous run; it is resolved to its ``Run`` before
            being forwarded.
        :keyword created_on: Forwarded to the run submitter (``None`` when absent).
        :keyword storage_creator: Forwarded to the run submitter (``None`` when absent).
        :return: A future that resolves to the finished ``Run``.
        :rtype: BatchClientRun
        :raises ValueError: If ``data`` is not a pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame")

        # The column mappings are indexed by data to indicate they come from the data
        # input. Update the inputs so that each entry is a dictionary with a data key
        # that contains the original input data.
        inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]

        # Pass the correct previous run to the evaluator
        run: Optional[BatchClientRun] = kwargs.pop("run", None)
        if run:
            kwargs["run"] = self._get_run(run)

        # Try to get async function to use
        if isinstance(flow, HasAsyncCallable):
            flow = flow._to_async()  # pylint: disable=protected-access

        # Pull the explicitly named options out first so they are not passed twice via **kwargs.
        created_on = kwargs.pop("created_on", None)
        storage_creator = kwargs.pop("storage_creator", None)

        # Start an event loop for async execution on a thread pool thread to separate it
        # from the caller's thread.
        run_submitter = RunSubmitter(self._config, self._thread_pool)
        run_future = self._thread_pool.submit(
            asyncio.run,
            run_submitter.submit(
                dynamic_callable=flow,
                inputs=inputs,
                column_mapping=column_mapping,
                name_prefix=evaluator_name,
                created_on=created_on,
                storage_creator=storage_creator,
                **kwargs,
            ),
        )

        return run_future

    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
        """Build a DataFrame of the run's inputs and outputs, side by side.

        Input columns (plus a line-number column) are prefixed with ``Prefixes.INPUTS`` and
        output columns with ``Prefixes.OUTPUTS``.

        :param client_run: The run handle returned by :meth:`run`.
        :type client_run: BatchClientRun
        :param all_results: When False, only the first ``default_num_results`` rows (from the
            configuration) are included; when True, every row is included.
        :type all_results: bool
        :return: The combined inputs/outputs table.
        :rtype: pandas.DataFrame
        """
        run = self._get_run(client_run)
        # Row cap shared by every frame built below so that they stay aligned.
        row_limit: Optional[int] = None if all_results else self._config.default_num_results

        def concat(*dataframes: pd.DataFrame) -> pd.DataFrame:
            # verify_integrity makes pandas raise on duplicate column labels instead of merging silently.
            return pd.concat(dataframes, axis=1, verify_integrity=True)

        def to_dataframe(items: Sequence[Mapping[str, Any]], *, max_length: Optional[int] = None) -> pd.DataFrame:
            """Convert a sequence of dictionaries to a DataFrame.

            :param items: Sequence of dictionaries to convert.
            :type items: Sequence[Mapping[str, Any]]
            :param max_length: Maximum number of items to include in the DataFrame. If None, include all items.
            :type max_length: Optional[int]
            :return: DataFrame containing the items.
            :rtype: pd.DataFrame
            """
            return pd.DataFrame(data=items if max_length is None else itertools.islice(items, max_length))

        inputs = concat(
            to_dataframe(run.inputs, max_length=row_limit),
            to_dataframe([{LINE_NUMBER: i} for i in range(len(run.inputs))], max_length=row_limit),
        ).add_prefix(Prefixes.INPUTS)

        outputs = to_dataframe(run.outputs, max_length=row_limit).add_prefix(Prefixes.OUTPUTS)

        return concat(inputs, outputs)

    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Return the run's metrics merged with any aggregated metrics.

        Aggregated metrics take precedence when both contain the same key.

        :param client_run: The run handle returned by :meth:`run`.
        :type client_run: BatchClientRun
        :return: The merged metrics.
        :rtype: Dict[str, Any]
        """
        run = self._get_run(client_run)
        return {**run.metrics, **self._get_aggregated_metrics(client_run)}

    def _get_aggregated_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Call the evaluator's ``__aggregate__`` hook, if it has one, on the run outputs.

        Failures are logged and never raised.

        :param client_run: The run handle returned by :meth:`run`.
        :type client_run: BatchClientRun
        :return: The aggregated metrics, or an empty dictionary when the evaluator has no
            aggregator, aggregation fails, or the aggregator does not return a dictionary.
        :rtype: Dict[str, Any]
        """
        run = self._get_run(client_run)
        try:
            if not _has_aggregator(run.dynamic_callable):
                # Nothing to aggregate: this is the normal case, so do not warn about it.
                return {}

            result_df = pd.DataFrame(run.outputs)
            if len(result_df.columns) == 1 and result_df.columns[0] == "output":
                # A lone "output" column is handed to the aggregator as a plain list of values.
                aggregate_input = result_df["output"].tolist()
            else:
                aggregate_input = [AttrDict(item) for item in result_df.to_dict("records")]

            aggr_func = getattr(run.dynamic_callable, "__aggregate__")
            aggregated_metrics = aggr_func(aggregate_input)

        except Exception as ex:  # pylint: disable=broad-exception-caught
            LOGGER.warning("Error calculating aggregations for evaluator, failed with error %s", ex)
            return {}

        if not isinstance(aggregated_metrics, dict):
            LOGGER.warning(
                "Aggregated metrics for evaluator is not a dictionary will not be logged as metrics",
            )
            return {}

        return aggregated_metrics

    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Summarize the run's status, duration, line counts and error information.

        :param client_run: The run handle returned by :meth:`run`.
        :type client_run: BatchClientRun
        :return: A dictionary with the keys ``status``, ``duration``, ``completed_lines``,
            ``failed_lines``, ``log_path`` (always None), ``error_message`` and ``error_code``.
        :rtype: Dict[str, Any]
        """
        run = self._get_run(client_run)
        result = run.result
        error = result.error if result else None

        # Without a result there is nothing to count.
        total_lines = result.total_lines if result else 0
        failed_lines = result.failed_lines if result else 0

        return {
            "status": run.status.value,
            "duration": str(run.duration),
            "completed_lines": total_lines - failed_lines,
            "failed_lines": failed_lines,
            "log_path": None,
            "error_message": f"({error.blame.value}) {error.message}" if error and error.blame else None,
            "error_code": f"{error.category.value}" if error and error.category else None,
        }

    @staticmethod
    def _get_run(run: BatchClientRun) -> Run:
        """Resolve a run handle to its ``Run``, blocking until the underlying future completes.

        :param run: The run handle returned by :meth:`run`.
        :type run: BatchClientRun
        :return: The result of the future.
        :rtype: Run
        """
        return cast(Future[Run], run).result()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import pandas
|
|
6
|
+
from os import PathLike
|
|
7
|
+
from typing import Any, Awaitable, Callable, Dict, Optional, Protocol, Union, runtime_checkable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BatchClientRun(Protocol):
    """Marker protocol for the handle a batch client returns from ``run``.

    It declares no members; it exists so that run handles have a common type name.
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@runtime_checkable
class HasAsyncCallable(Protocol):
    """Structural type for an object that can provide an async callable.

    The protocol is runtime checkable, so ``isinstance`` can be used to detect
    objects that define ``_to_async``.
    """

    def _to_async(self) -> Callable[[Any, Any], Awaitable[Any]]:
        """Return the async callable for this object."""
        ...
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BatchClient(Protocol):
    """The protocol for the batch client. This allows for running a flow on a data source
    and getting the details of the run."""

    def run(
        self,
        flow: Callable,
        data: Union[str, PathLike, pandas.DataFrame],
        column_mapping: Optional[Dict[str, str]] = None,
        evaluator_name: Optional[str] = None,
        **kwargs: Any,
    ) -> BatchClientRun:
        """Run the given flow on the data with the given column mapping.

        :param flow: The flow to run.
        :type flow: Union[Callable, HasAsyncCallable]
        :param data: The JSONL file containing the data to run the flow on,
            or the loaded data
        :type data: Union[str, PathLike, pandas.DataFrame]
        :param column_mapping: The column mapping to use.
        :type column_mapping: Optional[Dict[str, str]]
        :param evaluator_name: The name to associate with the run.
        :type evaluator_name: Optional[str]
        :param kwargs: Additional keyword arguments to pass to the flow.
        :return: The result of the batch client run.
        :rtype: BatchClientRun
        """
        ...

    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pandas.DataFrame:
        """Get the details of the run.

        :param client_run: The run to get the details of.
        :type client_run: BatchClientRun
        :param all_results: Whether to get all results.
        :type all_results: bool
        :return: The details of the run.
        :rtype: pandas.DataFrame
        """
        ...

    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Get the metrics of the run.

        :param client_run: The run to get the metrics of.
        :type client_run: BatchClientRun
        :return: The metrics of the run.
        :rtype: Dict[str, Any]
        """
        ...

    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Get the summary of the run.

        :param client_run: The run to get the summary of.
        :type client_run: BatchClientRun
        :return: The summary of the run.
        :rtype: Dict[str, Any]
        """
        ...
|
|
@@ -5,42 +5,49 @@ import inspect
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
7
|
import os
|
|
8
|
-
from
|
|
9
|
-
from typing import Callable, Dict, Optional, Union
|
|
8
|
+
from concurrent.futures import Future
|
|
9
|
+
from typing import Any, Callable, Dict, Optional, Sequence, Union, cast
|
|
10
10
|
|
|
11
11
|
import pandas as pd
|
|
12
|
-
from
|
|
13
|
-
from
|
|
12
|
+
from azure.ai.evaluation._legacy._adapters.types import AttrDict
|
|
13
|
+
from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
|
|
14
14
|
|
|
15
15
|
from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
|
|
16
16
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
17
17
|
|
|
18
18
|
from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
|
|
19
|
+
from .batch_clients import BatchClientRun
|
|
19
20
|
|
|
20
21
|
LOGGER = logging.getLogger(__name__)
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class CodeRun:
|
|
24
25
|
def __init__(
|
|
25
|
-
self,
|
|
26
|
-
|
|
26
|
+
self,
|
|
27
|
+
*,
|
|
28
|
+
run: Future,
|
|
29
|
+
input_data,
|
|
30
|
+
evaluator_name: Optional[str] = None,
|
|
31
|
+
aggregator: Callable[["CodeRun"], Future],
|
|
32
|
+
**kwargs, # pylint: disable=unused-argument
|
|
33
|
+
) -> None:
|
|
27
34
|
self.run = run
|
|
28
35
|
self.evaluator_name = evaluator_name if evaluator_name is not None else ""
|
|
29
36
|
self.input_data = input_data
|
|
30
|
-
self.aggregated_metrics =
|
|
37
|
+
self.aggregated_metrics = aggregator(self)
|
|
31
38
|
|
|
32
|
-
def get_result_df(self, exclude_inputs=False):
|
|
39
|
+
def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
|
|
33
40
|
batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
|
|
34
|
-
result_df = self.run.result(timeout=batch_run_timeout)
|
|
41
|
+
result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
|
|
35
42
|
if exclude_inputs:
|
|
36
43
|
result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
|
|
37
44
|
return result_df
|
|
38
45
|
|
|
39
|
-
def get_aggregated_metrics(self):
|
|
46
|
+
def get_aggregated_metrics(self) -> Dict[str, Any]:
|
|
40
47
|
try:
|
|
41
48
|
batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
|
|
42
|
-
aggregated_metrics = (
|
|
43
|
-
self.aggregated_metrics.result(timeout=batch_run_timeout)
|
|
49
|
+
aggregated_metrics: Optional[Any] = (
|
|
50
|
+
cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
|
|
44
51
|
if self.aggregated_metrics is not None
|
|
45
52
|
else None
|
|
46
53
|
)
|
|
@@ -77,7 +84,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
77
84
|
for param in inspect.signature(evaluator).parameters.values()
|
|
78
85
|
if param.name not in ["args", "kwargs"]
|
|
79
86
|
}
|
|
80
|
-
for value in input_df.to_dict("records"):
|
|
87
|
+
for value in cast(Sequence[Dict[str, Any]], input_df.to_dict("records")):
|
|
81
88
|
# Filter out only the parameters that are present in the input data
|
|
82
89
|
# if no parameters then pass data as is
|
|
83
90
|
filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
|
|
@@ -104,10 +111,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
104
111
|
verify_integrity=True,
|
|
105
112
|
)
|
|
106
113
|
|
|
107
|
-
|
|
114
|
+
@staticmethod
|
|
115
|
+
def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
|
|
108
116
|
try:
|
|
109
117
|
if _has_aggregator(evaluator):
|
|
110
|
-
aggregate_input = None
|
|
111
118
|
evaluator_output = run.get_result_df(exclude_inputs=True)
|
|
112
119
|
if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
|
|
113
120
|
aggregate_input = evaluator_output["output"].tolist()
|
|
@@ -126,10 +133,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
126
133
|
def run(
|
|
127
134
|
self, # pylint: disable=unused-argument
|
|
128
135
|
flow: Callable,
|
|
129
|
-
data: Union[os.PathLike,
|
|
130
|
-
evaluator_name: Optional[str] = None,
|
|
136
|
+
data: Union[str, os.PathLike, pd.DataFrame],
|
|
131
137
|
column_mapping: Optional[Dict[str, str]] = None,
|
|
132
|
-
|
|
138
|
+
evaluator_name: Optional[str] = None,
|
|
139
|
+
**kwargs: Any,
|
|
133
140
|
) -> CodeRun:
|
|
134
141
|
input_df = data
|
|
135
142
|
if not isinstance(input_df, pd.DataFrame):
|
|
@@ -150,23 +157,38 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
150
157
|
evaluator=flow,
|
|
151
158
|
input_df=input_df,
|
|
152
159
|
column_mapping=column_mapping,
|
|
160
|
+
evaluator_name=evaluator_name or "",
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
return CodeRun(
|
|
164
|
+
run=eval_future,
|
|
165
|
+
input_data=data,
|
|
153
166
|
evaluator_name=evaluator_name,
|
|
167
|
+
aggregator=lambda code_run: self._thread_pool.submit(
|
|
168
|
+
self._calculate_aggregations, evaluator=flow, run=code_run
|
|
169
|
+
),
|
|
154
170
|
)
|
|
155
|
-
run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
|
|
156
|
-
aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
|
|
157
|
-
run.aggregated_metrics = aggregation_future
|
|
158
|
-
return run
|
|
159
171
|
|
|
160
|
-
def get_details(self,
|
|
172
|
+
def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
    """Return the per-row results of a code-client run.

    :param client_run: The run handle previously returned by this client.
    :type client_run: BatchClientRun
    :param all_results: When False, the ``inputs.*`` columns are excluded from the result.
    :type all_results: bool
    :return: The result data frame of the run.
    :rtype: pandas.DataFrame
    """
    code_run = self._get_result(client_run)
    # exclude_inputs is the inverse of all_results.
    return code_run.get_result_df(exclude_inputs=not all_results)
|
|
163
176
|
|
|
164
|
-
def get_metrics(self,
|
|
177
|
+
def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
    """Return the aggregated metrics of a code-client run.

    The metrics are also printed to stdout. Any exception raised while
    fetching or printing them is logged at debug level and an empty
    dictionary is returned instead.

    :param client_run: The run handle previously returned by this client.
    :type client_run: BatchClientRun
    :return: The aggregated metrics, or ``{}`` if they could not be obtained.
    :rtype: Dict[str, Any]
    """
    code_run = self._get_result(client_run)
    try:
        metrics = code_run.get_aggregated_metrics()
        print("Aggregated metrics")
        print(metrics)
        return metrics
    except Exception as ex:  # pylint: disable=broad-exception-caught
        # Best effort: a failed aggregation must not fail the whole evaluation.
        LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", code_run.evaluator_name, ex)
        return {}
|
|
187
|
+
|
|
188
|
+
def get_run_summary(self, client_run: BatchClientRun) -> Any:  # pylint: disable=unused-argument
    """Return the summary of a code-client run.

    Not implemented for this client: the argument is ignored and ``None`` is
    always returned.

    :param client_run: The run handle (unused).
    :type client_run: BatchClientRun
    :return: Always ``None``.
    :rtype: Any
    """
    return None
|
|
191
|
+
|
|
192
|
+
@staticmethod
def _get_result(run: BatchClientRun) -> CodeRun:
    """View a generic batch run handle as a ``CodeRun``.

    This is a typing-only ``cast``; no runtime check or conversion is done.

    :param run: The run handle to reinterpret.
    :type run: BatchClientRun
    :return: The very same object, typed as ``CodeRun``.
    :rtype: CodeRun
    """
    code_run = cast(CodeRun, run)
    return code_run
|
|
@@ -2,47 +2,61 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
import os
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
from
|
|
5
|
+
import types
|
|
6
|
+
from typing import Optional, Type, Union
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
|
|
9
|
+
from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
|
|
10
|
+
from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
|
|
11
|
+
from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
|
|
12
|
+
inject_openai_api as ported_inject_openai_api,
|
|
13
|
+
recover_openai_api as ported_recover_openai_api,
|
|
14
|
+
)
|
|
9
15
|
|
|
10
16
|
from azure.ai.evaluation._constants import (
|
|
11
17
|
OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
|
|
12
18
|
OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
|
|
13
19
|
PF_BATCH_TIMEOUT_SEC,
|
|
14
20
|
PF_BATCH_TIMEOUT_SEC_DEFAULT,
|
|
21
|
+
PF_DISABLE_TRACING,
|
|
15
22
|
)
|
|
16
23
|
|
|
17
|
-
from ..._user_agent import
|
|
24
|
+
from ..._user_agent import UserAgentSingleton
|
|
18
25
|
from .._utils import set_event_loop_policy
|
|
26
|
+
from .batch_clients import BatchClient
|
|
27
|
+
from ._run_submitter_client import RunSubmitterClient
|
|
19
28
|
from .code_client import CodeClient
|
|
20
29
|
from .proxy_client import ProxyClient
|
|
21
30
|
|
|
22
31
|
|
|
23
|
-
class
|
|
24
|
-
"""Context manager for batch run
|
|
32
|
+
class EvalRunContext:
|
|
33
|
+
"""Context manager for eval batch run.
|
|
25
34
|
|
|
26
35
|
:param client: The client to run in the context.
|
|
27
36
|
:type client: Union[
|
|
28
|
-
~azure.ai.evaluation._evaluate.
|
|
29
|
-
~azure.ai.evaluation._evaluate.
|
|
37
|
+
~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
|
|
38
|
+
~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
|
|
30
39
|
]
|
|
31
40
|
"""
|
|
32
41
|
|
|
33
|
-
def __init__(self, client) -> None:
|
|
42
|
+
def __init__(self, client: BatchClient) -> None:
    """Store the client and initialise the bookkeeping used on enter/exit.

    :param client: The batch client that will run inside this context.
    :type client: BatchClient
    """
    # Working directory at construction time; __exit__ changes back to the stored value.
    self._original_cwd = os.getcwd()
    # Flags telling __exit__ which timeout environment variables it should remove again.
    self._is_otel_timeout_set_by_system = False
    self._is_batch_timeout_set_by_system = False
    self.client = client
|
|
47
|
+
|
|
48
|
+
def __enter__(self) -> None:
|
|
49
|
+
# Preserve current working directory, as PF may change it without restoring it afterward
|
|
50
|
+
self._original_cwd = os.getcwd()
|
|
37
51
|
|
|
38
|
-
def __enter__(self):
|
|
39
52
|
if isinstance(self.client, CodeClient):
|
|
40
|
-
ClientUserAgentUtil.append_user_agent(
|
|
53
|
+
ClientUserAgentUtil.append_user_agent(UserAgentSingleton().value)
|
|
41
54
|
inject_openai_api()
|
|
42
55
|
|
|
43
56
|
if isinstance(self.client, ProxyClient):
|
|
44
57
|
os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
|
|
45
58
|
os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
|
|
59
|
+
os.environ[PF_DISABLE_TRACING] = "true"
|
|
46
60
|
|
|
47
61
|
if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
|
|
48
62
|
os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
|
|
@@ -56,13 +70,25 @@ class BatchRunContext:
|
|
|
56
70
|
# For addressing the issue of asyncio event loop closed on Windows
|
|
57
71
|
set_event_loop_policy()
|
|
58
72
|
|
|
59
|
-
|
|
73
|
+
if isinstance(self.client, RunSubmitterClient):
|
|
74
|
+
set_event_loop_policy()
|
|
75
|
+
ported_inject_openai_api()
|
|
76
|
+
|
|
77
|
+
def __exit__(
|
|
78
|
+
self,
|
|
79
|
+
exc_type: Optional[Type[BaseException]],
|
|
80
|
+
exc_value: Optional[BaseException],
|
|
81
|
+
exc_tb: Optional[types.TracebackType],
|
|
82
|
+
) -> None:
|
|
83
|
+
os.chdir(self._original_cwd)
|
|
84
|
+
|
|
60
85
|
if isinstance(self.client, CodeClient):
|
|
61
86
|
recover_openai_api()
|
|
62
87
|
|
|
63
88
|
if isinstance(self.client, ProxyClient):
|
|
64
89
|
os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
|
|
65
90
|
os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
|
|
91
|
+
os.environ.pop(PF_DISABLE_TRACING, None)
|
|
66
92
|
|
|
67
93
|
if self._is_batch_timeout_set_by_system:
|
|
68
94
|
os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
|
|
@@ -71,3 +97,6 @@ class BatchRunContext:
|
|
|
71
97
|
if self._is_otel_timeout_set_by_system:
|
|
72
98
|
os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
|
|
73
99
|
self._is_otel_timeout_set_by_system = False
|
|
100
|
+
|
|
101
|
+
if isinstance(self.client, RunSubmitterClient):
|
|
102
|
+
ported_recover_openai_api()
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
# pylint: disable=protected-access
|
|
6
|
+
|
|
7
|
+
import inspect
|
|
8
|
+
import logging
|
|
9
|
+
import math
|
|
10
|
+
import os
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from collections import OrderedDict
|
|
13
|
+
from concurrent.futures import Future
|
|
14
|
+
from typing import Any, Callable, Dict, Optional, Union, cast
|
|
15
|
+
|
|
16
|
+
from azure.ai.evaluation._legacy._adapters.entities import Run
|
|
17
|
+
from azure.ai.evaluation._legacy._adapters._configuration import Configuration
|
|
18
|
+
from azure.ai.evaluation._legacy._adapters.client import PFClient
|
|
19
|
+
from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
Configuration.get_instance().set_config("trace.destination", "none")
|
|
26
|
+
LOGGER = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ProxyRun:
    """Handle for a run submitted through ``ProxyClient``; wraps the pending future."""

    def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
        """Keep a reference to the future of the submitted run.

        :param run: The future that will produce the run.
        :type run: Future
        :param kwargs: Accepted and ignored.
        """
        self.run = run
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
    """Batch client that submits evaluator runs to a ``PFClient`` on a thread pool."""

    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
        self,
        **kwargs: Any,
    ) -> None:
        """Create the wrapped ``PFClient`` and the worker thread pool.

        :param kwargs: Keyword arguments forwarded unchanged to ``PFClient``.
        """
        self._pf_client = PFClient(**kwargs)
        self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")

    def run(
        self,
        flow: Callable,
        data: Union[str, os.PathLike, pd.DataFrame],
        column_mapping: Optional[Dict[str, str]] = None,
        evaluator_name: Optional[str] = None,
        **kwargs: Any,
    ) -> ProxyRun:
        """Submit ``flow`` for execution over ``data`` and return a handle to the pending run.

        :param flow: The evaluator to run.
        :type flow: Callable
        :param data: The data to run on. A pandas DataFrame is rejected.
        :type data: Union[str, os.PathLike, pd.DataFrame]
        :param column_mapping: The column mapping passed to the underlying client.
        :type column_mapping: Optional[Dict[str, str]]
        :param evaluator_name: Name used when generating a default run name.
        :type evaluator_name: Optional[str]
        :keyword name: Explicit run name; a timestamped one is generated when empty or absent.
        :keyword run: A previous run handle; it is resolved via ``get_result`` before being forwarded.
        :return: A handle wrapping the future of the submitted run.
        :rtype: ProxyRun
        :raises ValueError: If ``data`` is a pandas DataFrame.
        """
        if isinstance(data, pd.DataFrame):
            raise ValueError("Data cannot be a pandas DataFrame")

        # Use the async variant of the evaluator only when enabled through the environment.
        async_enabled = os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true"
        flow_to_run: Callable = flow
        if async_enabled and isinstance(flow, HasAsyncCallable):
            flow_to_run = flow._to_async()  # pylint: disable=protected-access

        run_name: str = kwargs.pop("name", "")
        if not run_name:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            run_name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{timestamp}"

        # Pass the correct previous run to the evaluator
        previous_run: Optional[BatchClientRun] = kwargs.pop("run", None)
        if previous_run:
            kwargs["run"] = self.get_result(previous_run)

        use_async = self._should_batch_use_async(flow_to_run)
        pending = self._thread_pool.submit(
            self._pf_client.run,
            flow_to_run,
            data=data,
            column_mapping=column_mapping,  # type: ignore
            batch_use_async=use_async,
            name=run_name,
            **kwargs,
        )
        return ProxyRun(run=pending)

    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
        """Return the details of a run, with ``"(Failed)"`` cells replaced by NaN.

        :param client_run: The run handle previously returned by ``run``.
        :type client_run: BatchClientRun
        :param all_results: Forwarded to the underlying client's ``get_details``.
        :type all_results: bool
        :return: The details of the run.
        :rtype: pandas.DataFrame
        """
        resolved: Run = self.get_result(client_run)
        details = self._pf_client.get_details(resolved, all_results=all_results)
        details.replace("(Failed)", math.nan, inplace=True)
        return details

    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Return the metrics of a run as reported by the underlying client.

        :param client_run: The run handle previously returned by ``run``.
        :type client_run: BatchClientRun
        :return: The metrics of the run.
        :rtype: Dict[str, Any]
        """
        resolved: Run = self.get_result(client_run)
        return self._pf_client.get_metrics(resolved)

    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Return an ordered summary (status, duration, line counts, log path) of a run.

        :param client_run: The run handle previously returned by ``run``.
        :type client_run: BatchClientRun
        :return: The summary of the run.
        :rtype: Dict[str, Any]
        """
        resolved: Run = self.get_result(client_run)

        # pylint: disable=protected-access
        system_metrics = resolved._properties.get("system_metrics", {})
        completed_lines = system_metrics.get("__pf__.lines.completed", "NA")
        failed_lines = system_metrics.get("__pf__.lines.failed", "NA")

        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
        has_failures = resolved.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0
        status = "Completed with Errors" if has_failures else resolved.status

        # Built key by key so the insertion order is the reporting order.
        summary: Dict[str, Any] = OrderedDict()
        summary["status"] = status
        summary["duration"] = str((resolved._end_time or resolved._created_on) - resolved._created_on)
        summary["completed_lines"] = completed_lines
        summary["failed_lines"] = failed_lines
        summary["log_path"] = str(resolved._output_path)
        return summary

    @staticmethod
    def get_result(run: BatchClientRun) -> Run:
        """Return the result of the future held by a ``ProxyRun`` handle.

        :param run: The run handle, treated as a ``ProxyRun``.
        :type run: BatchClientRun
        :return: The value returned by the handle's future.
        :rtype: Run
        """
        proxy_run = cast(ProxyRun, run)
        return proxy_run.run.result()

    @staticmethod
    def _should_batch_use_async(flow):
        """Tell whether ``flow`` should be run through the async batch path.

        :param flow: The evaluator that is about to be submitted.
        :return: True when ``AI_EVALS_BATCH_USE_ASYNC`` is "true" (the default) and
            either ``flow.__call__`` or ``flow`` itself is a coroutine function.
        :rtype: bool
        """
        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() != "true":
            return False
        if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
            return True
        return inspect.iscoroutinefunction(flow)
|