azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +0 -16
- azure/ai/evaluation/_common/rai_service.py +1 -1
- azure/ai/evaluation/_common/utils.py +1 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +84 -68
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +96 -67
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +8 -4
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +13 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/RECORD +50 -40
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/_red_team/_utils/__init__.py +0 -3
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
@@ -52,22 +52,6 @@ try:
 except ImportError:
     print("[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.")
 
-# RedTeam requires a dependency on pyrit, but python 3.9 is not supported by pyrit.
-# So we only import it if it's available and the user has pyrit.
-try:
-    from ._red_team._red_team import RedTeam
-    from ._red_team._attack_strategy import AttackStrategy
-    from ._red_team._attack_objective_generator import RiskCategory
-    from ._red_team._red_team_result import RedTeamOutput
-    _patch_all.extend([
-        "RedTeam",
-        "RedTeamOutput",
-        "AttackStrategy",
-        "RiskCategory",
-    ])
-except ImportError:
-    print("[INFO] Could not import RedTeam. Please install the dependency with `pip install azure-ai-evaluation[redteam]`.")
-
 
 __all__ = [
     "evaluate",
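Together with the `{_red_team → red_team}` renames in the file list above, this hunk removes the conditional top-level re-exports: red-teaming types now live in a public module instead of being patched into the package root when pyrit happens to be installed. A hedged sketch of the resulting import change (the exact re-export list of the new `red_team/__init__.py` is inferred from the file listing, not shown in this diff):

    # 1.4.0: top-level re-exports, removed by the hunk above (and only
    # present when the pyrit extra was installed).
    # from azure.ai.evaluation import RedTeam, AttackStrategy, RiskCategory

    # 1.5.0: imports from the new public module (assumed re-exports).
    from azure.ai.evaluation.red_team import RedTeam, AttackStrategy, RiskCategory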
azure/ai/evaluation/_common/rai_service.py
CHANGED
@@ -15,7 +15,7 @@ from string import Template
 
 import jwt
 
-from
+from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
azure/ai/evaluation/_common/utils.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cas
 
 import nltk
 from typing_extensions import NotRequired, Required, TypeGuard
-from
+from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
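Both truncated removals above were promptflow imports; 1.5.0 replaces them with the new `azure.ai.evaluation._legacy._adapters` package (added wholesale in this release), which fences the promptflow dependency behind thin shims. The adapter bodies are not part of this diff, so the following is only a plausible sketch of such a shim; the promptflow import path shown is an assumption:

    # Hypothetical shape of azure/ai/evaluation/_legacy/_adapters/_errors.py
    try:
        # Re-export the real type when the legacy promptflow dependency is
        # installed (import path assumed, not taken from this diff).
        from promptflow.core._errors import MissingRequiredPackage
    except ImportError:

        class MissingRequiredPackage(Exception):
            """Fallback raised when an optional package is required but absent."""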
azure/ai/evaluation/_converters/_ai_services.py
CHANGED
@@ -667,7 +667,7 @@ class AIAgentConverter:
         return evaluations
 
     @staticmethod
-    def
+    def _run_ids_from_conversation(conversation: dict) -> List[str]:
         """
         Extracts a list of unique run IDs from a conversation dictionary.
 
@@ -684,7 +684,7 @@ class AIAgentConverter:
         return run_ids
 
     @staticmethod
-    def
+    def _convert_from_conversation(
         conversation: dict, run_id: str, exclude_tool_calls_previous_runs: bool = False
     ) -> dict:
         """
@@ -765,7 +765,7 @@ class AIAgentConverter:
         return json.loads(final_result.to_json())
 
     @staticmethod
-    def
+    def _convert_from_file(filename: str, run_id: str) -> dict:
         """
         Converts the agent run from a JSON file to a format suitable for the OpenAI API, the JSON file being a thread.
 
@@ -801,4 +801,4 @@ class AIAgentConverter:
         with open(filename, mode="r", encoding="utf-8") as file:
             data = json.load(file)
 
-        return AIAgentConverter.
+        return AIAgentConverter._convert_from_conversation(data, run_id)
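The last hunk shows that `_convert_from_file` is a thin wrapper: it loads a thread JSON file and delegates to `_convert_from_conversation`. A hedged usage sketch (the file path and run ID are placeholders, and the leading underscores mark these as internal helpers rather than public API):

    import json

    from azure.ai.evaluation import AIAgentConverter  # requires azure-ai-projects

    # Convert a saved thread directly from disk...
    result = AIAgentConverter._convert_from_file("thread.json", run_id="run_123")

    # ...which, per the hunk above, is equivalent to:
    with open("thread.json", mode="r", encoding="utf-8") as file:
        data = json.load(file)
    result = AIAgentConverter._convert_from_conversation(data, "run_123")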
azure/ai/evaluation/_evaluate/_batch_run/code_client.py
CHANGED
@@ -6,17 +6,17 @@ import json
 import logging
 import os
 from concurrent.futures import Future
-from
-from typing import Any, Callable, Dict, Optional, Union, cast
+from typing import Any, Callable, Dict, Optional, Sequence, Union, cast
 
 import pandas as pd
-from
-from
+from azure.ai.evaluation._legacy._adapters.types import AttrDict
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
 from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
+from .batch_clients import BatchClientRun
 
 LOGGER = logging.getLogger(__name__)
 
@@ -84,7 +84,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             for param in inspect.signature(evaluator).parameters.values()
             if param.name not in ["args", "kwargs"]
         }
-        for value in input_df.to_dict("records"):
+        for value in cast(Sequence[Dict[str, Any]], input_df.to_dict("records")):
             # Filter out only the parameters that are present in the input data
             # if no parameters then pass data as is
             filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
@@ -133,10 +133,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
     def run(
         self,  # pylint: disable=unused-argument
         flow: Callable,
-        data: Union[os.PathLike,
-        evaluator_name: Optional[str] = None,
+        data: Union[str, os.PathLike, pd.DataFrame],
         column_mapping: Optional[Dict[str, str]] = None,
-
+        evaluator_name: Optional[str] = None,
+        **kwargs: Any,
     ) -> CodeRun:
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
@@ -157,7 +157,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             evaluator=flow,
             input_df=input_df,
             column_mapping=column_mapping,
-            evaluator_name=evaluator_name,
+            evaluator_name=evaluator_name or "",
         )
 
         return CodeRun(
@@ -169,11 +169,13 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             ),
         )
 
-    def get_details(self,
+    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+        run = self._get_result(client_run)
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self,
+    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run = self._get_result(client_run)
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
@@ -183,6 +185,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             return {}
         return aggregated_metrics
 
-    def get_run_summary(self,
+    def get_run_summary(self, client_run: BatchClientRun) -> Any: # pylint: disable=unused-argument
        # Not implemented
        return None
+
+    @staticmethod
+    def _get_result(run: BatchClientRun) -> CodeRun:
+        return cast(CodeRun, run)
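Every public method of `CodeClient` now accepts an opaque `BatchClientRun` and casts it back to its own run type, which is what lets the evaluate pipeline drive `CodeClient`, `ProxyClient`, and the new `RunSubmitterClient` through a single interface. The new `batch_clients` module itself is not included in this diff, so this is a hedged sketch of the contract its signatures imply:

    import os
    from typing import Any, Callable, Dict, Optional, Protocol, Union

    import pandas as pd

    # Opaque run handle; each client casts it back to its concrete run type.
    BatchClientRun = Any

    class BatchClient(Protocol):
        """Structural interface implied by the client signatures in this diff."""

        def run(
            self,
            flow: Callable,
            data: Union[str, os.PathLike, pd.DataFrame],
            column_mapping: Optional[Dict[str, str]] = None,
            evaluator_name: Optional[str] = None,
            **kwargs: Any,
        ) -> BatchClientRun: ...

        def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame: ...

        def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]: ...

        def get_run_summary(self, client_run: BatchClientRun) -> Any: ...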
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
CHANGED
@@ -5,9 +5,9 @@ import os
 import types
 from typing import Optional, Type, Union
 
-from
-from
-from
+from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
+from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
+from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
 
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
@@ -19,6 +19,8 @@ from azure.ai.evaluation._constants import (
 
 from ..._user_agent import USER_AGENT
 from .._utils import set_event_loop_policy
+from .batch_clients import BatchClient
+from ._run_submitter_client import RunSubmitterClient
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 
@@ -33,7 +35,7 @@ class EvalRunContext:
     ]
     """
 
-    def __init__(self, client:
+    def __init__(self, client: BatchClient) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
@@ -64,6 +66,9 @@ class EvalRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()
 
+        if isinstance(self.client, RunSubmitterClient):
+            set_event_loop_policy()
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
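`EvalRunContext` now accepts any `BatchClient` rather than a `Union` of the two concrete clients, and applies the Windows asyncio event-loop policy for the new `RunSubmitterClient` as well. A minimal usage sketch (the zero-argument `RunSubmitterClient()` call mirrors how `_evaluate` constructs it later in this diff):

    from azure.ai.evaluation._evaluate._batch_run import EvalRunContext, RunSubmitterClient

    with EvalRunContext(RunSubmitterClient()):
        # Batch runs submitted here inherit the batch-timeout, user-agent,
        # and event-loop setup managed by the context.
        ...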
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
CHANGED
@@ -8,15 +8,21 @@ import inspect
 import logging
 import math
 import os
+from datetime import datetime
 from collections import OrderedDict
 from concurrent.futures import Future
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
+from azure.ai.evaluation._legacy._adapters.entities import Run
+from azure.ai.evaluation._legacy._adapters._configuration import Configuration
+from azure.ai.evaluation._legacy._adapters.client import PFClient
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
 import pandas as pd
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
+
+
+Configuration.get_instance().set_config("trace.destination", "none")
 LOGGER = logging.getLogger(__name__)
 
 
@@ -26,46 +32,56 @@ class ProxyRun:
 
 
 class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
-    def __init__( # pylint: disable=missing-client-constructor-parameter-credential
-        self,
+    def __init__( # pylint: disable=missing-client-constructor-parameter-credential
+        self,
+        **kwargs: Any,
     ) -> None:
-        self._pf_client =
-        self._thread_pool =
+        self._pf_client = PFClient(**kwargs)
+        self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")
 
     def run(
         self,
-        flow:
-        data: Union[str, os.PathLike],
+        flow: Callable,
+        data: Union[str, os.PathLike, pd.DataFrame],
         column_mapping: Optional[Dict[str, str]] = None,
-
+        evaluator_name: Optional[str] = None,
+        **kwargs: Any,
     ) -> ProxyRun:
-
-
+        if isinstance(data, pd.DataFrame):
+            raise ValueError("Data cannot be a pandas DataFrame")
+
+        flow_to_run: Callable = flow
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and isinstance(flow, HasAsyncCallable):
             flow_to_run = flow._to_async() # pylint: disable=protected-access
 
+        name: str = kwargs.pop("name", "")
+        if not name:
+            name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
             flow_to_run,
             data=data,
-            column_mapping=column_mapping,
+            column_mapping=column_mapping, # type: ignore
             batch_use_async=batch_use_async,
-
+            name=name,
+            **kwargs,
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self,
-        run: Run =
+    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = self.get_result(client_run)
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
-    def get_metrics(self,
-        run: Run =
+    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
         return self._pf_client.get_metrics(run)
 
-    def get_run_summary(self,
-        run =
+    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
 
         # pylint: disable=protected-access
         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
@@ -81,13 +97,17 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
         return OrderedDict(
             [
                 ("status", status),
-                ("duration", str(run._end_time - run._created_on)),
+                ("duration", str((run._end_time or run._created_on) - run._created_on)),
                 ("completed_lines", completed_lines),
                 ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
 
+    @staticmethod
+    def get_result(run: BatchClientRun) -> Run:
+        return cast(ProxyRun, run).run.result()
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
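When no explicit `name` is passed, `ProxyClient.run` now synthesizes a unique, timestamped run name per evaluator, which keeps concurrent evaluator runs distinguishable. A quick illustration of the format added above (the evaluator key and printed value are examples only):

    from datetime import datetime

    evaluator_name = "groundedness"  # placeholder evaluator key
    name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
    print(name)  # e.g. azure_ai_evaluation_evaluators_groundedness_20250411_093015_123456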
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
CHANGED
@@ -5,7 +5,7 @@ import os
 import types
 from typing import Optional, Type
 
-from
+from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
azure/ai/evaluation/_evaluate/_eval_run.py
CHANGED
@@ -13,7 +13,7 @@ import uuid
 from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
azure/ai/evaluation/_evaluate/_evaluate.py
CHANGED
@@ -6,13 +6,11 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -27,7 +25,14 @@ from .._constants import (
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    ProxyRun,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -35,8 +40,8 @@ from ._utils import (
     _write_output,
     DataLoaderFactory,
 )
+from ._batch_run.batch_clients import BatchClient
 
-TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -71,7 +76,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException: # only exception that can be cause is all NaN values
@@ -122,7 +127,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -161,37 +166,40 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
-        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
             details_cols = col
 
     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException: # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
-
+
     if details_cols:
         details_df = df[details_cols]
         detail_defect_rates = {}
-
+
         for key, value in details_df.items():
             _process_rows(value, detail_defect_rates)
-
+
         for key, value in detail_defect_rates.items():
             col_with_boolean_values = pd.to_numeric(value, errors="coerce")
             try:
-                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
             except EvaluationException: # only exception that can be cause is all NaN values
                 msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
                 LOGGER.warning(msg)
-
+
     return label_cols, defect_rates
 
+
 def _process_rows(row, detail_defect_rates):
     for key, value in row.items():
         if key not in detail_defect_rates:
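The repeated `cast(List[float], ...)` edits in these aggregation hunks are typing-only; the arithmetic is unchanged. For reference, a tiny self-contained illustration of the defect-rate computation these functions perform, using a plain pandas mean in place of the package's `list_mean_nan_safe` helper:

    import pandas as pd

    labels = pd.Series([True, False, True, None], dtype="object")
    numeric = pd.to_numeric(labels, errors="coerce")  # 1.0, 0.0, 1.0, NaN
    defect_rate = round(numeric.mean(skipna=True), 2)  # NaN excluded -> 0.67
    print(defect_rate)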
@@ -199,6 +207,7 @@ def _process_rows(row, detail_defect_rates):
             detail_defect_rates[key].append(value)
     return detail_defect_rates
 
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -330,7 +339,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -478,7 +487,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    batch_client:
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -499,14 +508,21 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
+    if not isinstance(batch_client, ProxyClient):
+        raise ValueError("Only ProxyClient supports target runs for now.")
+
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run
-
-
-
-
-
+        run = cast(
+            ProxyRun,
+            batch_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                stream=True,
+                name=_run_name,
+            ),
         )
 
     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
@@ -606,7 +622,6 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
@@ -757,20 +772,24 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})
 
-
+    target_run: Optional[Run] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
+        # Right now, only the ProxyClient that uses Promptflow supports a target function
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        batch_run_data = os.path.abspath(data)
+
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data,
+            target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():
@@ -784,6 +803,17 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
+    elif kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
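With this hunk, `_evaluate` selects its batch engine in a fixed order: a target function forces `ProxyClient` (handled earlier); otherwise the private `_use_run_submitter_client` flag selects the new promptflow-free `RunSubmitterClient`, fed the in-memory DataFrame; otherwise `_use_pf_client` (default `True`) keeps `ProxyClient` with an absolute data path; and `CodeClient` remains the final fallback. A hedged caller-side sketch; the flag is consumed via `kwargs.pop` above, and its forwarding through the public `evaluate(**kwargs)` entry point, like the dataset path, is an assumption:

    from azure.ai.evaluation import evaluate

    def exact_match(*, response: str, ground_truth: str) -> dict:
        # Minimal callable evaluator, defined only to keep the sketch self-contained.
        return {"exact_match": float(response == ground_truth)}

    result = evaluate(
        data="eval_input.jsonl",                 # hypothetical local JSONL dataset
        evaluators={"exact_match": exact_match},
        _use_run_submitter_client=True,          # opt in to the new batch engine
    )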
@@ -799,46 +829,32 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
 
-
-
-
-
-
-
-        }
-        for evaluator_name, run in runs.items()
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
             }
-
-
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+            for evaluator_name, run in runs.items()
+        }
 
     # Concatenate all results
-    evaluators_result_df =
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -880,7 +896,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     metrics.update(evaluators_metric)
 
     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
+    target_run: Optional[Run] = None
     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = None
     if trace_destination:
|