azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluate/_telemetry/__init__.py
+++ b/azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -6,41 +6,38 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, Literal, Optional, Union, cast

 import pandas as pd
-from
-from
-from
-from
-from promptflow.core import Prompty as prompty_core
+from azure.ai.evaluation._legacy._adapters._flows import FlexFlow as flex_flow
+from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty as prompty_sdk
+from azure.ai.evaluation._legacy._adapters._flows import Flow as dag_flow
+from azure.ai.evaluation._legacy._adapters.client import PFClient
 from typing_extensions import ParamSpec

-from
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+
 from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

 P = ParamSpec("P")
-R = TypeVar("R")


-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
     """
     Get evaluator type for telemetry.

     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: Literal["content-safety", "built-in", "custom"]
     """
-    built_in = False
-    content_safety = False
-
     module = inspect.getmodule(evaluator)
-
-
-
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

     if content_safety:
         return "content-safety"
@@ -67,7 +64,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):

     try:
         # Cover flex flow and prompty based evaluator
-        if isinstance(evaluator, (prompty_sdk,
+        if isinstance(evaluator, (prompty_sdk, flex_flow)):
             name = evaluator.name
             pf_type = evaluator.__class__.__name__
         # Cover dag flow based evaluator
@@ -95,85 +92,3 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         "type": _get_evaluator_type(evaluator),
         "alias": evaluator_name if evaluator_name else "",
     }
-
-
-# cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
-    """Decorator to log evaluate activity
-
-    :param func: The function to be decorated
-    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, R]
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        from promptflow._sdk._telemetry import ActivityType, log_activity
-        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
-
-        evaluators = kwargs.get("evaluators", [])
-        azure_ai_project = kwargs.get("azure_ai_project", None)
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-
-        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
-        evaluate_target = bool(kwargs.get("target", None))
-        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
-            "track_in_cloud": track_in_cloud,
-            "evaluate_target": evaluate_target,
-            "evaluator_config": evaluator_config,
-        }
-
-        with log_activity(
-            get_telemetry_logger(),
-            "pf.evals.evaluate",
-            activity_type=ActivityType.PUBLICAPI,
-            user_agent=USER_AGENT,
-            custom_dimensions=custom_dimensions,
-        ):
-            result = func(*args, **kwargs)
-
-            try:
-                evaluators_info = []
-                for evaluator_name, evaluator in evaluators.items():
-                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
-                    try:
-                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
-                            like=f"outputs.{evaluator_name}", axis=1
-                        )
-
-                        failed_rows = (
-                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
-                        )
-                        total_rows = evaluator_df.shape[0]
-
-                        evaluator_info["failed_rows"] = failed_rows
-                        evaluator_info["total_rows"] = total_rows
-                    except Exception as e:  # pylint: disable=broad-exception-caught
-                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
-                    evaluators_info.append(evaluator_info)
-
-                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
-                with log_activity(
-                    get_telemetry_logger(),
-                    "pf.evals.evaluate_usage_info",
-                    activity_type=ActivityType.PUBLICAPI,
-                    user_agent=USER_AGENT,
-                    custom_dimensions=custom_dimensions,
-                ):
-                    pass
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info: %s", e)
-
-        return result
-
-    return wrapper
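Note: the rewrite above replaces boolean bookkeeping in _get_evaluator_type with a module-name check and removes the promptflow-based log_evaluate_activity decorator entirely. The classification rule can be restated as a standalone sketch (an illustration only, not SDK surface area; the final return branches fall outside the hunk and are assumed to mirror the docstring):

    # Sketch: how an evaluator's module name maps to the telemetry "type" value.
    def classify(module_name: str) -> str:
        built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
        content_safety = built_in and module_name.startswith(
            "azure.ai.evaluation._evaluators._content_safety"
        )
        if content_safety:
            return "content-safety"
        if built_in:
            return "built-in"
        return "custom"

    # Module paths taken from the file list above.
    assert classify("azure.ai.evaluation._evaluators._content_safety._violence") == "content-safety"
    assert classify("azure.ai.evaluation._evaluators._bleu._bleu") == "built-in"
    assert classify("__main__") == "custom"  # e.g. a user-defined evaluator function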
--- a/azure/ai/evaluation/_evaluate/_utils.py
+++ b/azure/ai/evaluation/_evaluate/_utils.py
@@ -6,15 +6,30 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
+import uuid
+import base64
+import math

 import pandas as pd
+from tqdm import tqdm

-from azure.
-from azure.ai.evaluation.
+from azure.core.pipeline.policies import UserAgentPolicy
+from azure.ai.evaluation._legacy._adapters.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._user_agent import UserAgentSingleton
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)

@@ -23,14 +38,22 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )

-AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])

+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str

-
+
+def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"


-def extract_workspace_triad_from_trace_provider(
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
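Note: the collections.namedtuple triad becomes a typed typing.NamedTuple called AzureMLWorkspace; attribute access and tuple unpacking are unchanged for callers. A one-line illustration with placeholder values:

    ws = AzureMLWorkspace("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")
    sub, rg, name = ws                            # still unpacks like a tuple
    assert ws.workspace_name == "my-workspace"    # still supports attribute access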
@@ -44,10 +67,20 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")


 def load_jsonl(path):
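Note: with the fallback above, extract_workspace_triad_from_trace_provider first parses the trace destination against AZURE_WORKSPACE_REGEX_FORMAT (groups 1, 3, and 5), and only consults the local workspace config and the AZUREML_ARM_* environment variables when a segment is missing. An illustrative call, assuming the azureml:// form that _trace_destination_from_project_scope later in this file produces (the IDs are placeholders):

    trace_destination = (
        "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
        "/resourceGroups/my-rg"
        "/providers/Microsoft.MachineLearningServices"
        "/workspaces/my-workspace"
    )
    ws = extract_workspace_triad_from_trace_provider(trace_destination)
    assert ws == AzureMLWorkspace(
        "00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace"
    )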
@@ -55,49 +88,186 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]


-def
-
+def _store_multimodal_content(messages, tmpdir: str):
+    # verify if images folder exists
+    images_folder_path = os.path.join(tmpdir, "images")
+    os.makedirs(images_folder_path, exist_ok=True)

-
-
-
-
-
+    # traverse all messages and replace base64 image data with new file name.
+    for message in messages:
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                process_message_content(content, images_folder_path)
+
+
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")
+
+        if not image_url or "url" not in image_url:
+            return None
+
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None
+
+
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
     )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))

-
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(), path=tmpdir, metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                properties=properties,
+                tags=tags,
+            )
+        )
+
+        # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+        evaluation_id = (
+            upload_run_response.name  # type: ignore[attr-defined]
+            if hasattr(upload_run_response, "name")
+            else upload_run_response.id
+        )
+        update_run_response = client.update_evaluation_run(
+            name=evaluation_id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    "evaluationResultId": create_evaluation_result_response.id,
+                },
+            ),
+        )
+
+        return update_run_response.properties.get("AiStudioEvaluationUri")


 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Optional[Run],
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
+    **kwargs,
+) -> Optional[str]:
+    from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
     if trace_destination is None:
-        LOGGER.
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None

-
-
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri

     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values

     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-
+        management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
-
-        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+        artifact_name = EvalRun.EVALUATION_ARTIFACT

         with tempfile.TemporaryDirectory() as tmpdir:
+            # storing multi_modal images if exists
+            col_name = "inputs.conversation"
+            if col_name in instance_results.columns:
+                for item in instance_results[col_name].items():
+                    value = item[1]
+                    if "messages" in value:
+                        _store_multimodal_content(value["messages"], tmpdir)
+
+            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)

             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
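Note: two helpers above handle multimodal payloads: _store_multimodal_content walks a conversation's messages, and process_message_content swaps each inline "data:image/...;base64," URL for a file written under an images/ folder, mutating the message in place. A minimal sketch of that behavior, assuming process_message_content is imported from azure.ai.evaluation._evaluate._utils (the payload is an arbitrary byte string, not a real image):

    import base64, os, tempfile

    content = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64," + base64.b64encode(b"not-really-a-png").decode()},
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        images_folder = os.path.join(tmpdir, "images")
        os.makedirs(images_folder, exist_ok=True)
        process_message_content(content, images_folder)

        # The data URL was replaced with a relative "images/<uuid>.png" path and the
        # decoded bytes were written to that file under tmpdir.
        assert content["image_url"]["url"].startswith("images/")
        assert os.listdir(images_folder)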
@@ -110,11 +280,18 @@ def _log_metrics_and_instance_results(
         # adding these properties to avoid showing traces if a dummy run is created.
         # We are doing that only for the pure evaluation runs.
         if run is None:
+            properties = {
+                EvaluationRunProperties.RUN_TYPE: "eval_run",
+                EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+            }
+            properties.update(_convert_name_map_into_property_entries(name_map))
+            ev_run.write_properties_to_run_history(properties=properties)
+        else:
             ev_run.write_properties_to_run_history(
                 properties={
-
-                    "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-                    "isEvaluatorRun": "true",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 }
             )

@@ -138,7 +315,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url


-def _trace_destination_from_project_scope(project_scope:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -151,17 +328,24 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str:
     return trace_destination


-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     p = Path(path)
-    if
+    if p.is_dir():
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
+
+    # Use tqdm.write to print message without interfering with any current progress bar
+    # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+    try:
+        tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    except Exception:
+        print(f'Evaluation results saved to "{p.resolve()}".\n')


 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
@@ -211,7 +395,7 @@ def _apply_column_mapping(
     return result_df


-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
     return hasattr(evaluator, "__aggregate__")


@@ -234,11 +418,76 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
         return default_value


-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
     import asyncio
     import platform

     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i : i + w] for i in range(0, len(s), w)]
+
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if num_segments > max_segments:
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True, dtype=object)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename, dtype=str)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
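Note: _convert_name_map_into_property_entries exists because run-history properties have a length limit, so the JSON-encoded evaluator name map is chopped into 950-character segments, one property per segment, preceded by a length entry (or -1 when even ten segments would not fit). The splitting is plain fixed-width slicing, as _wrap shows; a quick sketch of the arithmetic (the actual property key strings come from EvaluationRunProperties in _constants.py and are not shown in this diff):

    import json, math

    name_map = {f"evaluator_{i}": f"outputs.evaluator_{i}" for i in range(100)}
    encoded = json.dumps(name_map)

    segment_length = 950
    segments = [encoded[i : i + segment_length] for i in range(0, len(encoded), segment_length)]
    assert len(segments) == math.ceil(len(encoded) / segment_length)
    # One "<NAME_MAP>_<i>" property per segment plus a "<NAME_MAP_LENGTH>" entry holding
    # len(segments); if len(segments) > 10 only {"<NAME_MAP_LENGTH>": -1} is stored.

The new loader classes are chosen by extension: DataLoaderFactory.get_loader("data.csv").load() goes through pd.read_csv, and anything else falls back to the JSONL loader.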
--- /dev/null
+++ b/azure/ai/evaluation/_evaluator_definition.py
@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance