azure-ai-evaluation 1.3.0-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +27 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +39 -5
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +23 -3
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_red_team.py +1887 -0
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/red_team/_utils/constants.py +65 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
- azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -8,15 +8,21 @@ import inspect
 import logging
 import math
 import os
+from datetime import datetime
 from collections import OrderedDict
 from concurrent.futures import Future
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
+from azure.ai.evaluation._legacy._adapters.entities import Run
+from azure.ai.evaluation._legacy._adapters._configuration import Configuration
+from azure.ai.evaluation._legacy._adapters.client import PFClient
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
 import pandas as pd
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
+
+
+Configuration.get_instance().set_config("trace.destination", "none")
 LOGGER = logging.getLogger(__name__)
 
 
@@ -26,46 +32,56 @@ class ProxyRun:
 
 
 class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
-    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
-        self,
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
+        self,
+        **kwargs: Any,
     ) -> None:
-        self._pf_client =
-        self._thread_pool =
+        self._pf_client = PFClient(**kwargs)
+        self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")
 
     def run(
         self,
-        flow:
-        data: Union[str, os.PathLike],
+        flow: Callable,
+        data: Union[str, os.PathLike, pd.DataFrame],
         column_mapping: Optional[Dict[str, str]] = None,
-
+        evaluator_name: Optional[str] = None,
+        **kwargs: Any,
     ) -> ProxyRun:
-
-
+        if isinstance(data, pd.DataFrame):
+            raise ValueError("Data cannot be a pandas DataFrame")
+
+        flow_to_run: Callable = flow
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and isinstance(flow, HasAsyncCallable):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
+        name: str = kwargs.pop("name", "")
+        if not name:
+            name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
             flow_to_run,
             data=data,
-            column_mapping=column_mapping,
+            column_mapping=column_mapping,  # type: ignore
            batch_use_async=batch_use_async,
-
+            name=name,
+            **kwargs,
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self,
-        run: Run =
+    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = self.get_result(client_run)
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
-    def get_metrics(self,
-        run: Run =
+    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
         return self._pf_client.get_metrics(run)
 
-    def get_run_summary(self,
-        run =
+    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
 
         # pylint: disable=protected-access
         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
@@ -81,13 +97,17 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         return OrderedDict(
             [
                 ("status", status),
-                ("duration", str(run._end_time - run._created_on)),
+                ("duration", str((run._end_time or run._created_on) - run._created_on)),
                 ("completed_lines", completed_lines),
                 ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
 
+    @staticmethod
+    def get_result(run: BatchClientRun) -> Run:
+        return cast(ProxyRun, run).run.result()
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
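The reworked ProxyClient above takes an opaque BatchClientRun handle and resolves it back to a promptflow Run via the new get_result helper. Below is a minimal, hedged sketch of how the client could be exercised in isolation; the evaluator callable and the JSONL path are hypothetical, and the import relies on ProxyClient being re-exported from the private _batch_run package as shown in the _evaluate.py hunks further down.

import os
from azure.ai.evaluation._evaluate._batch_run import ProxyClient

# Hypothetical evaluator: any callable taking row columns as keyword arguments.
def exact_match(*, response: str, ground_truth: str) -> dict:
    return {"exact_match": float(response.strip() == ground_truth.strip())}

# By default the client swaps in an async callable for flows implementing
# HasAsyncCallable; setting this to "false" keeps everything on the sync path.
os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"

client = ProxyClient()
proxy_run = client.run(
    flow=exact_match,
    data="eval_data.jsonl",        # hypothetical path; DataFrames are rejected by this client
    evaluator_name="exact_match",  # folded into the generated run name when no name is given
)
details = client.get_details(proxy_run, all_results=True)  # pandas DataFrame of per-line outputs
summary = client.get_run_summary(proxy_run)                # OrderedDict with status, duration, ...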
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -5,7 +5,7 @@ import os
 import types
 from typing import Optional, Type
 
-from
+from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -13,7 +13,7 @@ import uuid
 from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -404,7 +404,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             LOGGER.warning("The run results file was not found, skipping artifacts upload.")
             return
         # First we will list the files and the appropriate remote paths for them.
-        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.
+        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_id)
         remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -6,13 +6,11 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -27,7 +25,14 @@ from .._constants import (
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    ProxyRun,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -35,8 +40,8 @@ from ._utils import (
     _write_output,
     DataLoaderFactory,
 )
+from ._batch_run.batch_clients import BatchClient
 
-TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -71,7 +76,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +127,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -152,26 +157,57 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     ]
     label_cols = []
+    details_cols = []
     for col in df.columns:
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+            details_cols = col
 
     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
+
+    if details_cols:
+        details_df = df[details_cols]
+        detail_defect_rates = {}
+
+        for key, value in details_df.items():
+            _process_rows(value, detail_defect_rates)
+
+        for key, value in detail_defect_rates.items():
+            col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+            try:
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
     return label_cols, defect_rates
 
 
+def _process_rows(row, detail_defect_rates):
+    for key, value in row.items():
+        if key not in detail_defect_rates:
+            detail_defect_rates[key] = []
+        detail_defect_rates[key].append(value)
+    return detail_defect_rates
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
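To make the new "_details" aggregation concrete: each row of a *_details column holds a dict of sub-metric flags, _process_rows fans those dicts out into per-key lists, and each list is averaged into a "<details_column>.<key>_defect_rate" entry. A small self-contained sketch of that pivot-then-average pattern is below; the column and key names are illustrative, and the mean is a stand-in for list_mean_nan_safe.

import math
import pandas as pd

def process_rows(row_dict, detail_defect_rates):
    # Mirror of the helper in the hunk above: collect per-row values into per-key lists.
    for key, value in row_dict.items():
        detail_defect_rates.setdefault(key, []).append(value)
    return detail_defect_rates

# Illustrative frame: one *_details column whose cells are dicts of 0/1 flags.
details_col = "outputs.code_vulnerability.code_vulnerability_details"  # hypothetical column name
df = pd.DataFrame(
    {
        details_col: [
            {"sql_injection": 1, "path_injection": 0},
            {"sql_injection": 0, "path_injection": 0},
        ]
    }
)

detail_defect_rates: dict = {}
for row_dict in df[details_col]:
    process_rows(row_dict, detail_defect_rates)

defect_rates = {}
for key, values in detail_defect_rates.items():
    numeric = pd.to_numeric(values, errors="coerce")
    valid = numeric[~pd.isna(numeric)]
    rate = float(valid.mean()) if len(valid) else math.nan  # nan-safe mean analogue
    defect_rates[f"{details_col}.{key}_defect_rate"] = round(rate, 2)

print(defect_rates)  # sql_injection defect rate 0.5, path_injection defect rate 0.0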
@@ -303,7 +339,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -451,7 +487,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    batch_client:
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -472,22 +508,31 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
+    if not isinstance(batch_client, ProxyClient):
+        raise ValueError("Only ProxyClient supports target runs for now.")
+
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run
-
-
-
-
-
+        run = cast(
+            ProxyRun,
+            batch_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                stream=True,
+                name=_run_name,
+            ),
         )
 
     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
     run_summary = batch_client.get_run_summary(run)
 
     if run_summary["completed_lines"] == 0:
-        msg = (
-
+        msg = (
+            f"Evaluation target failed to produce any results."
+            f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+        )
         raise EvaluationException(
             message=msg,
             target=ErrorTarget.EVALUATE,
@@ -577,7 +622,6 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
@@ -728,20 +772,24 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})
 
-
+    target_run: Optional[Run] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
+        # Right now, only the ProxyClient that uses Promptflow supports a target function
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        batch_run_data = os.path.abspath(data)
+
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data,
+            target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():
@@ -755,6 +803,17 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+    elif kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
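The if/elif chain in the hunk above is what now decides which batch client executes the evaluators and what form the data takes. A rough sketch of that selection order in isolation, for reference only; the underscore-prefixed flags are private, undocumented switches taken verbatim from the hunk, and the function below is a hypothetical mirror rather than SDK code.

import os
from typing import Any, Dict, Tuple, Union

import pandas as pd

def select_batch_client(
    data: Union[str, os.PathLike],
    input_data_df: pd.DataFrame,
    has_target: bool,
    kwargs: Dict[str, Any],
) -> Tuple[str, Union[str, pd.DataFrame]]:
    """Return (client_kind, batch_run_data) following the order used in _evaluate."""
    if has_target:
        # Targets are only supported through the promptflow-backed ProxyClient.
        return "ProxyClient", os.path.abspath(data)
    if kwargs.pop("_use_run_submitter_client", False):
        # New in this release: the in-process RunSubmitterClient consumes the DataFrame directly.
        return "RunSubmitterClient", input_data_df
    if kwargs.pop("_use_pf_client", True):
        # Default path: ProxyClient, which needs an absolute file path for multiple evaluators.
        return "ProxyClient", os.path.abspath(data)
    # Fallback: CodeClient evaluates the in-memory DataFrame.
    return "CodeClient", input_data_df

# e.g. select_batch_client("data.jsonl", df, has_target=False, kwargs={})
# -> ("ProxyClient", "/abs/path/to/data.jsonl")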
@@ -770,46 +829,32 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
 
-
-
-
-
-
-
-            }
-            for evaluator_name, run in runs.items()
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
             }
-
-
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+            for evaluator_name, run in runs.items()
+        }
 
     # Concatenate all results
-    evaluators_result_df =
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -851,7 +896,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     metrics.update(evaluators_metric)
 
     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
+    target_run: Optional[Run] = None
     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = None
    if trace_destination:
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -9,11 +9,10 @@ import logging
 from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
-from
-from
-from
-from
-from promptflow.core import Prompty as prompty_core
+from azure.ai.evaluation._legacy._adapters._flows import FlexFlow as flex_flow
+from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty as prompty_sdk
+from azure.ai.evaluation._legacy._adapters._flows import Flow as dag_flow
+from azure.ai.evaluation._legacy._adapters.client import PFClient
 from typing_extensions import ParamSpec
 
 from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
@@ -66,7 +65,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
     try:
         # Cover flex flow and prompty based evaluator
-        if isinstance(evaluator, (prompty_sdk,
+        if isinstance(evaluator, (prompty_sdk, flex_flow)):
             name = evaluator.name
             pf_type = evaluator.__class__.__name__
         # Cover dag flow based evaluator
@@ -94,86 +93,3 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         "type": _get_evaluator_type(evaluator),
         "alias": evaluator_name if evaluator_name else "",
     }
-
-
-# cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
-    """Decorator to log evaluate activity
-
-    :param func: The function to be decorated
-    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
-        from promptflow._sdk._telemetry import ActivityType, log_activity
-        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
-
-        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
-        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-
-        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
-        evaluate_target = bool(kwargs.get("target", None))
-        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions: Dict[str, Union[str, bool]] = {
-            "track_in_cloud": track_in_cloud,
-            "evaluate_target": evaluate_target,
-            "evaluator_config": evaluator_config,
-        }
-
-        with log_activity(
-            get_telemetry_logger(),
-            "pf.evals.evaluate",
-            activity_type=ActivityType.PUBLICAPI,
-            user_agent=USER_AGENT,
-            custom_dimensions=custom_dimensions,
-        ):
-            result = func(*args, **kwargs)
-
-            try:
-                evaluators_info = []
-                for evaluator_name, evaluator in evaluators.items():
-                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
-                    try:
-                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
-                            like=f"outputs.{evaluator_name}", axis=1
-                        )
-
-                        failed_rows = (
-                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
-                        )
-                        total_rows = evaluator_df.shape[0]
-
-                        evaluator_info["failed_rows"] = failed_rows
-                        evaluator_info["total_rows"] = total_rows
-                    except Exception as e:  # pylint: disable=broad-exception-caught
-                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
-                    evaluators_info.append(evaluator_info)
-
-                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
-                with log_activity(
-                    get_telemetry_logger(),
-                    "pf.evals.evaluate_usage_info",
-                    activity_type=ActivityType.PUBLICAPI,
-                    user_agent=USER_AGENT,
-                    custom_dimensions=custom_dimensions,
-                ):
-                    pass
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info: %s", e)
-
-            return result
-
-    return wrapper
azure/ai/evaluation/_evaluate/_utils.py
@@ -12,7 +12,7 @@ import uuid
 import base64
 
 import pandas as pd
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (
     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
@@ -46,7 +46,7 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
-    from
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
 
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
@@ -131,7 +131,7 @@ def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
     trace_destination: Optional[str],
-    run: Run,
+    run: Optional[Run],
     evaluation_name: Optional[str],
     **kwargs,
 ) -> Optional[str]:
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -8,6 +8,7 @@ from typing_extensions import overload, override
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
 class BleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,8 @@ class BleuScoreEvaluator(EvaluatorBase):
     indicator of quality.
 
     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    :param threshold: The threshold for the evaluation. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -31,17 +34,27 @@ class BleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize and call an BleuScoreEvaluator.
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_bleu_score_evaluator]
+            :end-before: [END threshold_bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self):
-
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
-        """Produce a
+        """Produce a bleu score evaluation result.
 
         :param eval_input: The input to the evaluation function.
         :type eval_input: Dict
@@ -56,9 +69,16 @@ class BleuScoreEvaluator(EvaluatorBase):
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+        binary_result = False
+        if self._higher_is_better:
+            binary_result = score >= self._threshold
+        else:
+            binary_result = score <= self._threshold
 
         return {
             "bleu_score": score,
+            "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_threshold": self._threshold,
         }
 
     @overload  # type: ignore
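Based on the constructor and output keys in the hunks above, a hedged usage sketch of the thresholded BLEU evaluator follows. The input strings and the printed score are illustrative; the exact pass/fail label strings come from EVALUATION_PASS_FAIL_MAPPING, and BleuScoreEvaluator is assumed to be exported from the public azure.ai.evaluation namespace as in earlier releases.

from azure.ai.evaluation import BleuScoreEvaluator

# threshold is keyword-only; with the default higher-is-better setting,
# scores >= threshold are reported as a pass.
bleu = BleuScoreEvaluator(threshold=0.3)

result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)

# Alongside the existing "bleu_score", the result now carries the pass/fail label
# and the threshold it was judged against, e.g. (values illustrative):
# {"bleu_score": 0.42, "bleu_result": "pass", "bleu_threshold": 0.3}
print(result)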