azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from typing_extensions import TypeAlias
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from promptflow._sdk.entities import Run as _Run
|
|
10
|
+
except ImportError:
|
|
11
|
+
from typing_extensions import Protocol
|
|
12
|
+
from typing import Any, Dict, Optional
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
class _Run(Protocol):
    """Structural stand-in for the ``Run`` entity, defined only when promptflow cannot be imported.

    It declares attributes only; no behavior is implemented here.
    """

    # NOTE(review): the attribute names presumably mirror the ones this package
    # reads from promptflow's ``Run`` entity -- confirm against the callers.
    name: str
    status: str
    _properties: Dict[str, Any]
    _created_on: datetime
    _end_time: Optional[datetime]
    _experiment_name: Optional[str]
    _output_path: Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
Run: TypeAlias = _Run
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from typing import Callable, Dict, Final, Optional
|
|
6
|
+
from typing_extensions import TypeAlias
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from promptflow.tracing import ThreadPoolExecutorWithContext as _ThreadPoolExecutorWithContext
|
|
11
|
+
from promptflow.tracing._integrations._openai_injector import (
|
|
12
|
+
inject_openai_api as _inject,
|
|
13
|
+
recover_openai_api as _recover,
|
|
14
|
+
)
|
|
15
|
+
from promptflow.tracing import _start_trace
|
|
16
|
+
except ImportError:
|
|
17
|
+
from concurrent.futures import ThreadPoolExecutor as _ThreadPoolExecutorWithContext
|
|
18
|
+
from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
|
|
19
|
+
inject_openai_api as _inject,
|
|
20
|
+
recover_openai_api as _recover,
|
|
21
|
+
)
|
|
22
|
+
from azure.ai.evaluation._legacy._batch_engine._trace import start_trace as _start_trace
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
ThreadPoolExecutorWithContext: TypeAlias = _ThreadPoolExecutorWithContext
|
|
26
|
+
inject_openai_api: Final[Callable[[], None]] = _inject
|
|
27
|
+
recover_openai_api: Final[Callable[[], None]] = _recover
|
|
28
|
+
start_trace: Final = _start_trace
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AttrDict(dict):
    """A dictionary that allows attribute access to its keys.

    ``d.key`` reads ``d["key"]`` and ``d.key = value`` writes ``d["key"]``.
    Looking up a key that does not exist raises :class:`AttributeError`
    instead of leaking the underlying :class:`KeyError`, so ``hasattr``,
    ``getattr`` with a default, and ``copy.deepcopy`` work as they do for
    ordinary objects.
    """

    def __getattr__(self, key: str) -> Any:
        """Return the item stored under ``key``.

        :param str key: The attribute (dictionary key) being read.
        :return: The value stored under ``key``.
        :raises AttributeError: If ``key`` is not present in the dictionary.
        """
        # __getattr__ only runs after normal attribute lookup has failed, so
        # real dict methods (``items``, ``get``, ...) are never shadowed by keys.
        try:
            return self[key]
        except KeyError:
            # The attribute protocol (hasattr, getattr default, copy, pickle)
            # only recognises AttributeError as "attribute is missing".
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {key!r}") from None

    def __setattr__(self, key: str, value: Any) -> None:
        """Store ``value`` as the dictionary item ``key``.

        :param str key: The attribute (dictionary key) being written.
        :param Any value: The value to store.
        """
        self[key] = value
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from typing import Final, Optional
|
|
6
|
+
from typing_extensions import TypeAlias
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from promptflow._utils.user_agent_utils import ClientUserAgentUtil as _ClientUserAgentUtil
|
|
11
|
+
from promptflow._utils.async_utils import async_run_allowing_running_loop as _async_run_allowing_running_loop
|
|
12
|
+
from promptflow._cli._utils import get_workspace_triad_from_local as _get_workspace_triad_from_local
|
|
13
|
+
except ImportError:
|
|
14
|
+
from azure.ai.evaluation._legacy._batch_engine._utils_deprecated import (
|
|
15
|
+
async_run_allowing_running_loop as _async_run_allowing_running_loop,
|
|
16
|
+
)
|
|
17
|
+
from azure.ai.evaluation._evaluate._utils import AzureMLWorkspace
|
|
18
|
+
|
|
19
|
+
class _ClientUserAgentUtil:
    """Fallback used when promptflow's ``ClientUserAgentUtil`` cannot be imported."""

    @staticmethod
    def append_user_agent(user_agent: Optional[str]):
        """Accept a user-agent string and do nothing with it.

        :param Optional[str] user_agent: The user agent value; currently ignored.
        """
        # TODO ralphe: implement?
        return None
|
|
24
|
+
|
|
25
|
+
def _get_workspace_triad_from_local() -> AzureMLWorkspace:
    """Fallback used when promptflow is unavailable.

    :return: An ``AzureMLWorkspace`` constructed from three empty strings.
    """
    blank = ""
    return AzureMLWorkspace(blank, blank, blank)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
ClientUserAgentUtil: TypeAlias = _ClientUserAgentUtil
|
|
30
|
+
async_run_allowing_running_loop: Final = _async_run_allowing_running_loop
|
|
31
|
+
get_workspace_triad_from_local: Final = _get_workspace_triad_from_local
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
# NOTE: This is a direct port of the bare minimum needed for BatchEngine functionality from
|
|
6
|
+
# the original Promptflow code. The goal here is expediency, not elegance. As such
|
|
7
|
+
# parts of this code may be a little "quirky", seem incomplete in places, or contain
|
|
8
|
+
# longer TODO comments than usual. In a future code update, large swaths of this code
|
|
9
|
+
# will be refactored or deleted outright.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from logging import Logger
|
|
7
|
+
|
|
8
|
+
from ..._constants import PF_BATCH_TIMEOUT_SEC_DEFAULT
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class BatchEngineConfig:
    """Settings for one batch of evaluations: the logger to use plus the
    timeout, concurrency and result-count limits. The numeric limits are
    validated as strictly positive when the instance is created."""

    logger: Logger
    """The logger to use for logging messages."""

    batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
    """The maximum amount of time to wait for all evaluations in the batch to complete."""

    line_timeout_seconds: int = 600
    """The maximum amount of time to wait for an evaluation to run against a single entry
    in the data input to complete."""

    max_concurrency: int = 10
    """The maximum number of evaluations to run concurrently."""

    use_async: bool = True
    """Whether to use asynchronous evaluation."""

    default_num_results: int = 100
    """The default number of results to return if you don't ask for all results."""

    raise_on_error: bool = True
    """Whether to raise an error if an evaluation fails."""

    def __post_init__(self):
        """Reject a missing logger and any non-positive numeric limit.

        :raises ValueError: On the first invalid field encountered.
        """
        if self.logger is None:
            raise ValueError("logger cannot be None")

        # Checked in declaration order so the first offending field is the one reported.
        positive_fields = (
            "batch_timeout_seconds",
            "line_timeout_seconds",
            "max_concurrency",
            "default_num_results",
        )
        for field_name in positive_fields:
            if getattr(self, field_name) <= 0:
                raise ValueError(f"{field_name} must be greater than 0")
|
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
# This contains code merged together from the following files:
|
|
6
|
+
# promptflow-devkit/promptflow/batch/_batch_engine.py
|
|
7
|
+
# promptflow-devkit/promptflow/_proxy/_python_executor_proxy.py
|
|
8
|
+
# promptflow-core/promptflow/executor/_script_executor.py
|
|
9
|
+
# TODO ralphe: The way this code does batch execution needs to be improved. For now
|
|
10
|
+
# porting over the code largely as is to remove the Promptflow dependency
|
|
11
|
+
# as quickly as possible. In phase 2 this code will be heavily refactored.
|
|
12
|
+
|
|
13
|
+
import inspect
|
|
14
|
+
import re
|
|
15
|
+
import asyncio
|
|
16
|
+
|
|
17
|
+
from math import floor
|
|
18
|
+
from asyncio import Semaphore
|
|
19
|
+
from concurrent.futures import Executor
|
|
20
|
+
from functools import partial
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from typing import (
|
|
24
|
+
Any,
|
|
25
|
+
Callable,
|
|
26
|
+
Dict,
|
|
27
|
+
Final,
|
|
28
|
+
Generator,
|
|
29
|
+
List,
|
|
30
|
+
Mapping,
|
|
31
|
+
MutableMapping,
|
|
32
|
+
Optional,
|
|
33
|
+
Sequence,
|
|
34
|
+
Set,
|
|
35
|
+
Tuple,
|
|
36
|
+
cast,
|
|
37
|
+
Literal,
|
|
38
|
+
)
|
|
39
|
+
from uuid import uuid4
|
|
40
|
+
|
|
41
|
+
from ._config import BatchEngineConfig
|
|
42
|
+
from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
|
|
43
|
+
from ._status import BatchStatus
|
|
44
|
+
from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
|
|
45
|
+
from ._run_storage import AbstractRunStorage, NoOpRunStorage
|
|
46
|
+
from .._common._logging import log_progress, logger, NodeLogManager
|
|
47
|
+
from ..._exceptions import ErrorBlame, EvaluationException
|
|
48
|
+
from ._exceptions import (
|
|
49
|
+
BatchEngineCanceledError,
|
|
50
|
+
BatchEngineError,
|
|
51
|
+
BatchEngineRunFailedError,
|
|
52
|
+
BatchEngineTimeoutError,
|
|
53
|
+
BatchEngineValidationError,
|
|
54
|
+
)
|
|
55
|
+
from ._utils_deprecated import (
|
|
56
|
+
async_run_allowing_running_loop,
|
|
57
|
+
convert_eager_flow_output_to_dict,
|
|
58
|
+
)
|
|
59
|
+
from ._openai_injector import CaptureOpenAITokenUsage
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
MAX_WORKER_COUNT: Final[int] = 10
|
|
63
|
+
KEYWORD_PATTERN: Final = re.compile(r"^\${([^{}]+)}$")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class BatchEngine:
|
|
67
|
+
"""This class is used to execute flows in batch mode"""
|
|
68
|
+
|
|
69
|
+
def __init__(
    self,
    func: Callable,
    *,
    config: BatchEngineConfig,
    storage: Optional[AbstractRunStorage] = None,
    executor: Optional[Executor] = None,
):
    """Build a batch engine around ``func``.

    :param Callable func: The function to run the flow
    :param BatchEngineConfig config: Supplies the batch timeout, the per-line
        timeout and the maximum concurrency, which are copied onto the engine
    :param Optional[AbstractRunStorage] storage: Where execution results are
        stored; a ``NoOpRunStorage`` is created when none (or a falsy one) is given
    :param Optional[Executor] executor: The executor to run the flow (if needed)
    """

    self._func: Callable = func
    self._config: BatchEngineConfig = config

    if storage:
        self._storage: AbstractRunStorage = storage
    else:
        self._storage = NoOpRunStorage()

    # Copy the limits out of the configuration once, at construction time.
    self._batch_timeout_sec = config.batch_timeout_seconds
    self._line_timeout_sec = config.line_timeout_seconds
    self._max_worker_count = config.max_concurrency

    self._executor: Optional[Executor] = executor
    self._is_canceled: bool = False
|
|
95
|
+
|
|
96
|
+
async def run(
    self,
    data: Sequence[Mapping[str, Any]],
    column_mapping: Optional[Mapping[str, str]],
    *,
    id: Optional[str] = None,
    max_lines: Optional[int] = None,
) -> BatchResult:
    """Run the wrapped callable over ``data`` and return the aggregated result.

    :param data: The input rows; must not be empty.
    :param column_mapping: Optional overrides for the column mapping, forwarded to
        ``_apply_column_mapping``.
    :keyword id: Identifier of the run; a random UUID4 string is used when falsy.
    :keyword max_lines: Forwarded to ``_apply_column_mapping``.
    :return: Whatever ``_exec_in_task`` produces for the mapped inputs.
    :raises BatchEngineValidationError: If ``data`` is empty, or the column mapping
        yields nothing (or only empty lines).
    :raises BatchEngineError: Wraps any exception that is not an
        ``EvaluationException``; those are re-raised untouched.
    """
    if not data:
        raise BatchEngineValidationError("Please provide a non-empty data mapping.")

    start_time = datetime.now(timezone.utc)

    batch_inputs = self._apply_column_mapping(data, column_mapping, max_lines)
    if not batch_inputs or not any(len(line) != 0 for line in batch_inputs):
        raise BatchEngineValidationError("No data to process.")

    try:
        run_id = id if id else str(uuid4())
        return await self._exec_in_task(run_id, batch_inputs, start_time)
    except EvaluationException:
        # Already a well-formed SDK error; let it through unchanged.
        raise
    except Exception as ex:
        raise BatchEngineError(
            "Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR
        ) from ex
|
|
123
|
+
|
|
124
|
+
def cancel(self):
    """Request cancellation of the batch run.

    This only raises the ``_is_canceled`` flag; the flag is checked by
    ``_exec_in_task`` while it waits for the batch task.
    """
    # TODO ralphe: Make sure this works
    self._is_canceled = True
|
|
127
|
+
|
|
128
|
+
def _apply_column_mapping(
|
|
129
|
+
self,
|
|
130
|
+
data: Sequence[Mapping[str, Any]],
|
|
131
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
132
|
+
max_lines: Optional[int],
|
|
133
|
+
) -> Sequence[Mapping[str, str]]:
|
|
134
|
+
|
|
135
|
+
resolved_column_mapping: Mapping[str, str] = self._resolve_column_mapping(column_mapping)
|
|
136
|
+
resolved_column_mapping.update(self._generate_defaults_for_column_mapping())
|
|
137
|
+
return self._apply_column_mapping_to_lines(data, resolved_column_mapping, max_lines)
|
|
138
|
+
|
|
139
|
+
def _resolve_column_mapping(
|
|
140
|
+
self,
|
|
141
|
+
column_mapping: Optional[Mapping[str, str]],
|
|
142
|
+
) -> Mapping[str, str]:
|
|
143
|
+
parameters = inspect.signature(self._func).parameters
|
|
144
|
+
default_column_mapping: Dict[str, str] = {
|
|
145
|
+
name: f"${{data.{name}}}"
|
|
146
|
+
for name, value in parameters.items()
|
|
147
|
+
if name not in ["self", "cls", "args", "kwargs"]
|
|
148
|
+
}
|
|
149
|
+
resolved_mapping: Dict[str, str] = default_column_mapping.copy()
|
|
150
|
+
|
|
151
|
+
for name, value in parameters.items():
|
|
152
|
+
if value and value.default is not inspect.Parameter.empty:
|
|
153
|
+
resolved_mapping.pop(name)
|
|
154
|
+
|
|
155
|
+
resolved_mapping.update(column_mapping or {})
|
|
156
|
+
return resolved_mapping
|
|
157
|
+
|
|
158
|
+
def _generate_defaults_for_column_mapping(self) -> Mapping[Literal["$defaults$"], Any]:
    """Collect the default values declared by the wrapped callable.

    :return: A single-entry mapping whose key is ``DEFAULTS_KEY`` and whose value maps
        each parameter that declares a default to that default.
    """
    signature = inspect.signature(self._func)

    defaults: Dict[str, Any] = {}
    for name, parameter in signature.parameters.items():
        if parameter.default is inspect.Parameter.empty:
            # Required parameter: nothing to record.
            continue
        defaults[name] = parameter.default

    return {DEFAULTS_KEY: defaults}
|
|
167
|
+
|
|
168
|
+
@staticmethod
def _apply_column_mapping_to_lines(
    data: Sequence[Mapping[str, Any]],
    column_mapping: Mapping[str, str],
    max_lines: Optional[int],
) -> Sequence[Mapping[str, Any]]:
    """Apply ``column_mapping`` to each row of ``data``.

    For every mapping entry (the ``DEFAULTS_KEY`` entry itself is skipped):

    * a non-string value is used as a literal;
    * a string that does not match ``KEYWORD_PATTERN`` is used as a literal;
    * otherwise the captured path is looked up with ``get_value_from_path``, first in
      the row and then in the defaults stored under ``DEFAULTS_KEY``.

    :param data: The raw input rows.
    :param column_mapping: The resolved mapping, optionally containing ``DEFAULTS_KEY``.
    :param max_lines: When truthy, only the first ``max_lines`` rows are processed;
        ``None`` and ``0`` both mean "no limit".
    :return: One mapped dict per processed row, in input order.
    :raises BatchEngineValidationError: As soon as a row has a reference that can be
        found neither in the row nor in the defaults; the message carries the 1-based
        line number and the missing paths in sorted order.
    """
    rows = data[:max_lines] if max_lines else data
    defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))

    inputs: List[Mapping[str, Any]] = []
    for line_number, row in enumerate(rows, start=1):
        mapped: Dict[str, Any] = {}
        missing_inputs: Set[str] = set()

        for key, value in column_mapping.items():
            if key == DEFAULTS_KEY:
                # Skip the defaults key
                continue

            if not isinstance(value, str):
                # All non-string values are literal values.
                mapped[key] = value
                continue

            match = KEYWORD_PATTERN.search(value)
            if match is None:
                # Literal string value
                mapped[key] = value
                continue

            dict_path = match.group(1)
            found, mapped_value = get_value_from_path(dict_path, row)
            if not found:  # try default value
                found, mapped_value = get_value_from_path(dict_path, defaults)

            if found:
                mapped[key] = mapped_value
            else:
                missing_inputs.add(dict_path)

        if missing_inputs:
            # Sorted so the error text is deterministic (set order is not).
            missing = ", ".join(sorted(missing_inputs))
            raise BatchEngineValidationError(f"Missing inputs for line {line_number}: '{missing}'")

        inputs.append(mapped)

    return inputs
|
|
216
|
+
|
|
217
|
+
async def _exec_in_task(
    self, run_id: str, batch_inputs: Sequence[Mapping[str, Any]], start_time: datetime
) -> BatchResult:
    """Execute the batch in a background task and summarise it as a ``BatchResult``.

    While the task runs, the cancel flag and the batch timeout are checked at least
    once per second. Lines that have no result when the task stops are reported as
    failed with the error "The line run is not completed.".

    :param run_id: Identifier of the run.
    :param batch_inputs: The mapped inputs, one per line.
    :param start_time: When the run started; used for the timeout and the result.
    :return: The aggregated status, token metrics, per-line details and (if any) error.
    """
    # Since the batch execution is not guaranteed to be completed in the same order
    # as the inputs, we keep track of these in a mapping from index to result
    results: Dict[int, BatchRunDetails] = {}
    status: BatchStatus = BatchStatus.Completed
    error: Optional[Exception] = None

    task = asyncio.create_task(self._exec_batch(run_id, batch_inputs, start_time, results))

    while not task.done():
        # Returns as soon as the task finishes, or after 1s so that cancellation and
        # the timeout are still checked every second. (A plain sleep(1) made every
        # batch take at least a full second per poll.)
        await asyncio.wait({task}, timeout=1)
        if self._is_canceled:
            task.cancel()
            # use current completed line results and aggregation results to create a BatchResult
            status = BatchStatus.Canceled
            error = BatchEngineCanceledError("The batch run is canceled by user.")
            break
        if self._batch_timeout_expired(start_time):
            task.cancel()
            status = BatchStatus.Failed
            error = BatchEngineTimeoutError(
                f"The batch run failed due to timeout [{self._batch_timeout_sec}s]. "
                "Please adjust the timeout to a higher value."
            )
            break

    # If the batch task itself died, retrieve and log its exception instead of
    # silently dropping it; the lines it did not finish are marked as failed below.
    if task.done() and not task.cancelled():
        batch_exception = task.exception()
        if batch_exception is not None:
            logger.error("The batch task stopped unexpectedly: %s", batch_exception)

    end_time = datetime.now(timezone.utc)
    metrics = TokenMetrics(0, 0, 0)
    failed_lines: int = 0

    # generate the details in the same order as the inputs and fill in the missing results
    # with a failed status
    result_details = [
        (
            results[i]
            if i in results
            else BatchRunDetails(
                id=BatchRunDetails.create_id(run_id, i),
                status=BatchStatus.Failed,
                result=None,
                start_time=None,
                end_time=None,
                tokens=TokenMetrics(0, 0, 0),
                error=BatchRunError("The line run is not completed.", None),
                index=i,
            )
        )
        for i in range(len(batch_inputs))
    ]
    self.handle_line_failures(result_details)

    for line_result in result_details:
        # Indicate the worst status of the batch run. This works because
        # canceled and failed have a higher value than completed.
        status = max(status, line_result.status)
        if BatchStatus.is_failed(line_result.status):
            failed_lines += 1
        if line_result.tokens:
            metrics.prompt_tokens += line_result.tokens.prompt_tokens
            metrics.completion_tokens += line_result.tokens.completion_tokens
            metrics.total_tokens += line_result.tokens.total_tokens

    if failed_lines and not error:
        error_message = f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
        # Append the first exception recorded on a line, if there is one.
        first_exception: Optional[Exception] = next(
            (result.error.exception for result in result_details if result.error and result.error.exception),
            None,
        )
        if first_exception is not None:
            error_message += f" {first_exception}"

        error = BatchEngineRunFailedError(error_message)

    return BatchResult(
        status=status,
        total_lines=len(batch_inputs),
        failed_lines=failed_lines,
        start_time=start_time,
        end_time=end_time,
        tokens=metrics,
        details=result_details,
        error=error,
    )
|
|
303
|
+
|
|
304
|
+
async def _exec_batch(
    self,
    run_id: str,
    batch_inputs: Sequence[Mapping[str, Any]],
    start_time: datetime,
    results: MutableMapping[int, BatchRunDetails],
) -> None:
    """Run every line of the batch, at most ``self._max_worker_count`` at a time.

    Finished lines are written into ``results`` (keyed by line index) as they
    complete, so the caller can read partial results even if this coroutine is
    cancelled. When it exits early, the line tasks that are still pending are
    cancelled.

    :param run_id: Identifier of the run.
    :param batch_inputs: The mapped inputs, one per line.
    :param start_time: When the run started; passed to the progress logger.
    :param results: Output mapping from line index to the line's details.
    """
    semaphore: Semaphore = Semaphore(self._max_worker_count)

    # TODO ralphe: This async code needs to refactored to use e.g. asyncio.gather, or
    #              asyncio.as_completed.
    async def create_under_semaphore(index: int, inputs: Mapping[str, Any]):
        async with semaphore:
            return await self._exec_line_async(run_id, inputs, index)

    pending = [
        asyncio.create_task(create_under_semaphore(index, inputs)) for index, inputs in enumerate(batch_inputs)
    ]

    total_lines: int = len(batch_inputs)
    completed_lines: int = 0
    try:
        while completed_lines < total_lines:
            # TODO ralphe: Fix this code so it doesn't re-order the outputs
            # wait for any task to complete
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            completed_line_results = [task.result() for task in done]
            # persist node run infos and flow run info in line result to storage
            self._persist_run_info([result for _, result in completed_line_results])
            results.update({index: result for index, result in completed_line_results})
            # update the progress log
            completed_lines += len(completed_line_results)
            log_progress(
                run_start_time=start_time,
                total_count=total_lines,
                current_count=completed_lines,
                # TODO ralphe: set logger to use here
            )
    finally:
        # asyncio.wait() does not cancel the tasks it waits on. If this coroutine is
        # cancelled (user cancel / batch timeout) or fails, stop the remaining line
        # tasks instead of leaving them running in the background. After a normal
        # finish ``pending`` is empty and this is a no-op.
        for leftover in pending:
            leftover.cancel()
|
|
342
|
+
|
|
343
|
+
def __preprocess_inputs(self, inputs: Mapping[str, Any]) -> Mapping[str, Any]:
|
|
344
|
+
|
|
345
|
+
func_params = inspect.signature(self._func).parameters
|
|
346
|
+
|
|
347
|
+
has_kwargs = any(p.kind == p.VAR_KEYWORD for p in func_params.values())
|
|
348
|
+
|
|
349
|
+
if has_kwargs:
|
|
350
|
+
return inputs
|
|
351
|
+
else:
|
|
352
|
+
filtered_params = {key: value for key, value in inputs.items() if key in func_params}
|
|
353
|
+
return filtered_params
|
|
354
|
+
|
|
355
|
+
async def _exec_line_async(
    self,
    run_id: str,
    inputs: Mapping[str, Any],
    index: int,
) -> Tuple[int, BatchRunDetails]:
    """Execute the wrapped callable for a single line.

    Exceptions derived from ``Exception`` that the callable raises are not
    propagated: the line is marked ``Failed`` and the exception is stored on the
    returned details. The end time is always recorded.

    :param run_id: Identifier of the run.
    :param inputs: The mapped inputs of this line.
    :param index: The 0-based index of the line within the batch.
    :return: ``(index, details)`` for the line.
    """
    with self._exec_line_context(run_id, index):
        details: BatchRunDetails = BatchRunDetails(
            id=f"{run_id}_{index}",
            status=BatchStatus.NotStarted,
            result=None,
            start_time=datetime.now(timezone.utc),
            end_time=None,
            tokens=TokenMetrics(0, 0, 0),
            error=None,
            index=index,
        )

        try:
            # TODO ralphe: Handle line timeouts here
            with CaptureOpenAITokenUsage() as captured_tokens:
                # NOTE: In the legacy code, any synchronous functions were executed in a different process
                #       for isolation reasons. However this isolation was violated in the way the code was
                #       used by the evaluation SDK (e.g. you need to have the module already loaded to pass the
                #       callable into the batch engine, so starting a new process to examine it was redundant).
                #       It also came with performance and memory usage costs (each line was processed in a
                #       separate process up to a maximum of 4), and these processes were created and torn down
                #       too frequently.
                #       For now we will just run the function in the current process, but in the future we may
                #       want to consider running the function in a separate process for isolation reasons.
                output: Any

                processed_inputs = self.__preprocess_inputs(inputs)
                if is_async_callable(self._func):
                    output = await self._func(**processed_inputs)
                else:
                    # to maximize the parallelism, we run the synchronous function in a separate thread
                    # and await its result. We are inside a coroutine, so the running loop is
                    # obtained with get_running_loop() rather than get_event_loop().
                    output = await asyncio.get_running_loop().run_in_executor(
                        self._executor, partial(self._func, **processed_inputs)
                    )

                # This should in theory never happen but as an extra precaution, let's check if the output
                # is awaitable and await it if it is.
                if inspect.isawaitable(output):
                    output = await output

                details.status = BatchStatus.Completed
                details.result = convert_eager_flow_output_to_dict(output)
                details.tokens.update(captured_tokens)
        except Exception as ex:
            details.status = BatchStatus.Failed
            details.error = BatchRunError(
                f"Error while evaluating single input: {ex.__class__.__name__}: {str(ex)}", ex
            )
        finally:
            details.end_time = datetime.now(timezone.utc)

        return index, details
|
|
414
|
+
|
|
415
|
+
@staticmethod
def handle_line_failures(run_infos: List[BatchRunDetails], raise_on_line_failure: bool = False):
    """Handle line failures in batch run

    Does nothing when no entry of ``run_infos`` has the status ``BatchStatus.Failed``.
    Otherwise either raises (``raise_on_line_failure=True``) or logs one error
    message that lists the failed indexes and the details of the first failure.

    :param run_infos: The per-line details of a batch run.
    :param raise_on_line_failure: Raise instead of logging when a line failed.
    :raises BatchEngineRunFailedError: If a line failed and ``raise_on_line_failure``
        is set.
    """
    failed_run_infos: List[BatchRunDetails] = [r for r in run_infos if r.status == BatchStatus.Failed]
    if not failed_run_infos:
        return

    first_failure = failed_run_infos[0]
    # ``error`` is optional on the details object (it starts out as None), so do not
    # assume it is set just because the status is Failed.
    first_fail_exception: str = str(first_failure.error.details) if first_failure.error else "Unknown error"

    if raise_on_line_failure:
        raise BatchEngineRunFailedError(f"Flow run failed due to the error: {first_fail_exception}")

    failed_indexes = ",".join(str(r.index) for r in failed_run_infos)
    failed_msg = (
        f"{len(failed_run_infos)}/{len(run_infos)} flow run failed, indexes: [{failed_indexes}],"
        f" exception of index {first_failure.index}: {first_fail_exception}"
    )
    logger.error(failed_msg)
|
|
432
|
+
|
|
433
|
+
def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
|
|
434
|
+
# TODO ralphe: implement?
|
|
435
|
+
pass
|
|
436
|
+
|
|
437
|
+
def _batch_timeout_expired(self, start_time: datetime) -> bool:
|
|
438
|
+
if self._batch_timeout_sec is None:
|
|
439
|
+
return False
|
|
440
|
+
return (datetime.now(timezone.utc) - start_time).total_seconds() > self._batch_timeout_sec
|
|
441
|
+
|
|
442
|
+
@contextmanager
def _exec_line_context(self, run_id: str, line_number: int) -> Generator[None, Any, None]:
    """Context in which a single line is executed.

    Enters a ``NodeLogManager`` whose node context is set to
    ``(run_id, "Flex", line_number)`` and, inside it, the context returned by
    ``_update_operation_context``.

    :param run_id: Identifier of the run.
    :param line_number: Index of the line being executed.
    """
    # TODO ralphe: Do proper tracing and logging here
    node_logs = NodeLogManager()
    node_logs.set_node_context(run_id, "Flex", line_number)

    with node_logs:
        with self._update_operation_context(run_id, line_number):
            yield
|
|
449
|
+
|
|
450
|
+
@contextmanager
|
|
451
|
+
def _update_operation_context(self, run_id: str, line_number: int) -> Generator[None, Any, None]:
|
|
452
|
+
# operation_context = OperationContext.get_instance()
|
|
453
|
+
# original_context = operation_context.copy()
|
|
454
|
+
# original_mode = operation_context.get("run_mode", RunMode.Test.name)
|
|
455
|
+
# values_for_context = {"flow_id": self._flow_id, "root_run_id": run_id}
|
|
456
|
+
# if original_mode == RunMode.Batch.name:
|
|
457
|
+
# values_for_otel = {
|
|
458
|
+
# "batch_run_id": run_id,
|
|
459
|
+
# "line_number": line_number,
|
|
460
|
+
# }
|
|
461
|
+
# else:
|
|
462
|
+
# values_for_otel = {"line_run_id": run_id}
|
|
463
|
+
# try:
|
|
464
|
+
# append_promptflow_package_ua(operation_context)
|
|
465
|
+
# operation_context.set_execution_target(execution_target=self._execution_target)
|
|
466
|
+
# operation_context.set_default_tracing_keys(DEFAULT_TRACING_KEYS)
|
|
467
|
+
# operation_context.run_mode = original_mode
|
|
468
|
+
# operation_context.update(values_for_context)
|
|
469
|
+
# for k, v in values_for_otel.items():
|
|
470
|
+
# operation_context._add_otel_attributes(k, v)
|
|
471
|
+
# # Inject OpenAI API to make sure traces and headers injection works and
|
|
472
|
+
# # update OpenAI API configs from environment variables.
|
|
473
|
+
# inject_openai_api()
|
|
474
|
+
yield
|
|
475
|
+
|
|
476
|
+
# finally:
|
|
477
|
+
# OperationContext.set_instance(original_context)
|