azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +5 -31
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +120 -300
- azure/ai/evaluation/_common/utils.py +23 -381
- azure/ai/evaluation/_constants.py +6 -19
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
- azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
- azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
- azure/ai/evaluation/_evaluate/_utils.py +28 -82
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +132 -203
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -2
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
- azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
- azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_simulator.py +207 -277
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +13 -31
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, TypeVar
 
 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,30 +16,31 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec
 
-from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
-
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
+R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable])
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: str
     """
-
-
+    built_in = False
+    content_safety = False
 
-
-
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
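The b2 implementation above classifies an evaluator by the module that defines it. A self-contained sketch of the same `inspect.getmodule` technique (the `classify` helper is illustrative, not part of the package):

```python
import inspect
import json


def classify(evaluator) -> str:
    """Label a callable "built-in" if it is defined inside the package's evaluator modules."""
    module = inspect.getmodule(evaluator)
    if module and module.__name__.startswith("azure.ai.evaluation._evaluators."):
        return "built-in"
    return "custom"


print(classify(json.loads))  # "custom": json.loads is defined in the json module
```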
@@ -97,22 +98,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
+    :rtype: Callable[P, R]
     """
 
     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators =
-        azure_ai_project =
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)
 
         pf_client = PFClient(
             config=(
@@ -123,11 +124,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
             user_agent=USER_AGENT,
         )
 
-
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions
+        custom_dimensions = {
             "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,
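Taken together, these telemetry hunks replace the decorator's hard-wired `EvaluationResult` return type with a `TypeVar`, making it generic over whatever the wrapped function returns. A minimal sketch of the `ParamSpec`/`TypeVar` pattern outside this package (the `traced` decorator name is illustrative):

```python
import functools
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")


def traced(func: Callable[P, R]) -> Callable[P, R]:
    """Wrap func without erasing its parameter or return types."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)

    return wrapper


@traced
def add(a: int, b: int) -> int:
    return a + b


assert add(1, 2) == 3  # type checkers still see (int, int) -> int
```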
azure/ai/evaluation/_evaluate/_utils.py

@@ -6,23 +6,15 @@ import logging
 import os
 import re
 import tempfile
+from collections import namedtuple
 from pathlib import Path
-from typing import
-import uuid
-import base64
+from typing import Dict
 
 import pandas as pd
-
-from
-
-from azure.ai.evaluation._constants import (
-    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-    DefaultOpenEncoding,
-    EvaluationRunProperties,
-    Prefixes,
-)
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
 
 LOGGER = logging.getLogger(__name__)
 
@@ -31,20 +23,14 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
-class AzureMLWorkspace(NamedTuple):
-    subscription_id: str
-    resource_group_name: str
-    workspace_name: str
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
 
 
-def is_none(value)
+def is_none(value):
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-    trace_provider: str,
-) -> AzureMLWorkspace:
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
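The hunk above trades the typed `NamedTuple` subclass for the classic `collections.namedtuple` factory. The two forms behave identically at runtime; only the type annotations differ, as this sketch shows:

```python
from collections import namedtuple
from typing import NamedTuple

# 1.0.0b2 form: factory function, no field annotations
AzureMLWorkspaceTriad = namedtuple(
    "AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]
)


# 1.0.0 form: typed class with the same runtime behavior
class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


a = AzureMLWorkspaceTriad("sub", "rg", "ws")
b = AzureMLWorkspace("sub", "rg", "ws")
assert tuple(a) == tuple(b)  # both index, unpack, and compare like plain tuples
```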
@@ -61,7 +47,7 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
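For context, the trace provider string being parsed here is an `azureml://` workspace path. A self-contained sketch of the extraction with a simplified pattern (the regex and example URI are illustrative stand-ins for `AZURE_WORKSPACE_REGEX_FORMAT`, whose full definition is not shown in this diff):

```python
import re
from collections import namedtuple

AzureMLWorkspaceTriad = namedtuple(
    "AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]
)

# Simplified stand-in; the package's real pattern is stricter and uses five groups.
PATTERN = (
    r"^azureml://subscriptions/([^/]+)/resource[gG]roups/([^/]+)"
    r"(/providers/Microsoft\.MachineLearningServices)?/workspaces/([^/]+)$"
)


def extract_triad(trace_provider: str) -> AzureMLWorkspaceTriad:
    match = re.match(PATTERN, trace_provider)
    if not match:
        raise ValueError(f"Malformed trace provider: {trace_provider}")
    return AzureMLWorkspaceTriad(match.group(1), match.group(2), match.group(4))


triad = extract_triad(
    "azureml://subscriptions/my-sub/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
)
assert triad.workspace_name == "my-ws"
```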
@@ -69,7 +55,7 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+def _azure_pf_client_and_triad(trace_destination):
     from promptflow.azure._cli._utils import _get_azure_pf_client
 
     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -82,45 +68,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     return azure_pf_client, ws_triad
 
 
-def _store_multimodal_content(messages, tmpdir: str):
-    # verify if images folder exists
-    images_folder_path = os.path.join(tmpdir, "images")
-    os.makedirs(images_folder_path, exist_ok=True)
-
-    # traverse all messages and replace base64 image data with new file name.
-    for message in messages:
-        if isinstance(message.get("content", []), list):
-            for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
-
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
-
-
 def _log_metrics_and_instance_results(
-    metrics
-    instance_results
-    trace_destination
-    run
-    evaluation_name
-) ->
-    from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
     if trace_destination is None:
-        LOGGER.
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
         return None
 
     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
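A hedged usage sketch of the removed `_store_multimodal_content` helper shown above, assuming the function and its module-level imports (`os`, `uuid`, `base64`) are in scope; the payload is arbitrary bytes standing in for real JPEG data:

```python
import base64
import os
import tempfile

fake_jpg = base64.b64encode(b"\xff\xd8\xff\xe0 not a real jpeg").decode()
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/jpg;base64,{fake_jpg}"}}
        ],
    }
]

with tempfile.TemporaryDirectory() as tmpdir:
    _store_multimodal_content(messages, tmpdir)
    # The inline data URL has been rewritten to a relative file path...
    assert messages[0]["content"][0]["image_url"]["url"].startswith("images/")
    # ...and the decoded bytes were written under <tmpdir>/images/.
    assert len(os.listdir(os.path.join(tmpdir, "images"))) == 1
```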
@@ -138,18 +94,10 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            # storing multi_modal images if exists
-            col_name = "inputs.conversation"
-            if col_name in instance_results.columns:
-                for item in instance_results[col_name].items():
-                    value = item[1]
-                    if "messages" in value:
-                        _store_multimodal_content(value["messages"], tmpdir)
-
-            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -164,9 +112,9 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-
-                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    "_azureml.evaluation_run": "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                    "isEvaluatorRun": "true",
                 }
             )
@@ -190,7 +138,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
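Given the three fields read above and the `azureml://` pattern parsed earlier in this file, the builder plausibly assembles a workspace URI like the one below; the exact format is an assumption, and `build_trace_destination` is an illustrative name, not the package's function:

```python
def build_trace_destination(project_scope: dict) -> str:
    # Assumed shape; mirrors the path that extract_workspace_triad_from_trace_provider parses.
    return (
        "azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}"
        "/providers/Microsoft.MachineLearningServices/workspaces/{project_name}"
    ).format(**project_scope)


print(build_trace_destination({
    "subscription_id": "my-sub",
    "resource_group_name": "my-rg",
    "project_name": "my-project",
}))
```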
@@ -203,19 +151,17 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     return trace_destination
 
 
-def _write_output(path
+def _write_output(path, data_dict):
     p = Path(path)
-    if
+    if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
-    print(f'Evaluation results saved to "{p.resolve()}".\n')
-
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config:
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
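The new `mapping_config: Dict[str, str]` signature takes promptflow-style `${data.<column>}` references. A simplified sketch of what applying such a mapping could look like; the helper and its exact semantics are assumptions, not the package's implementation:

```python
import re

import pandas as pd


def apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict) -> pd.DataFrame:
    """Rename columns referenced as ${data.<col>} to their mapped names."""
    renames = {}
    for target, reference in mapping_config.items():
        match = re.match(r"^\$\{data\.(.+)\}$", reference)
        if match and match.group(1) in source_df.columns:
            renames[match.group(1)] = target
    return source_df.rename(columns=renames)


df = pd.DataFrame({"question": ["What is BLEU?"], "answer": ["An n-gram overlap metric."]})
mapped = apply_column_mapping(df, {"query": "${data.question}", "response": "${data.answer}"})
assert list(mapped.columns) == ["query", "response"]
```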
@@ -265,7 +211,7 @@ def _apply_column_mapping(
     return result_df
 
 
-def _has_aggregator(evaluator
+def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
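`_has_aggregator` is a duck-typed protocol check: any evaluator exposing an `__aggregate__` attribute opts into run-level aggregation. An illustrative (hypothetical) evaluator that would pass it:

```python
class ExactMatchEvaluator:
    """Hypothetical evaluator with a custom aggregation step."""

    def __call__(self, *, response: str, ground_truth: str) -> dict:
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results: list) -> dict:
        # Roll per-line scores up into a single run-level metric.
        scores = [r["exact_match"] for r in line_results]
        return {"exact_match_rate": sum(scores) / len(scores) if scores else 0.0}


def _has_aggregator(evaluator) -> bool:
    return hasattr(evaluator, "__aggregate__")


assert _has_aggregator(ExactMatchEvaluator())
```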
@@ -288,11 +234,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
     return default_value
 
 
-def set_event_loop_policy()
+def set_event_loop_policy():
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
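The Windows branch works around the "Event loop is closed" `RuntimeError` seen with the default proactor loop at interpreter shutdown; the selector policy avoids it. Calling the function once before any asyncio work is enough, as in this sketch mirroring the code above:

```python
import asyncio
import platform


def set_event_loop_policy() -> None:
    if platform.system().lower() == "windows":
        # Safe on other platforms: the attribute is only touched inside the guard.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())


async def main() -> None:
    await asyncio.sleep(0)


set_event_loop_policy()
asyncio.run(main())
```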
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-
+    Evaluator that computes the BLEU Score between two strings.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.
 
-
-    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-    indicator of quality.
+    **Usage**
 
-
+    .. code-block:: python
 
-
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-
-
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call an BleuScoreEvaluator.
-    """
+    **Output format**
+
+    .. code-block:: python
 
-
-
+        {
+            "bleu_score": 0.22
+        }
+    """
 
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
     :keyword ground_truth: The ground truth to be compared against.
     :paramtype ground_truth: str
     :return: The BLEU score.
-    :rtype:
+    :rtype: dict
     """
     return async_run_allowing_running_loop(
         self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
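The diff does not show how `_AsyncBleuScoreEvaluator` computes the score, but an equivalent 0-to-1 BLEU can be reproduced with NLTK; the tokenization and smoothing below are assumptions, not the evaluator's internals:

```python
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

response = "Tokyo is the capital of Japan."
ground_truth = "The capital of Japan is Tokyo."

score = sentence_bleu(
    [ground_truth.split()],  # one reference, whitespace-tokenized
    response.split(),
    smoothing_function=SmoothingFunction().method4,
)
print({"bleu_score": round(score, 2)})
```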
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._retrieval import RetrievalEvaluator
+from ._chat import ChatEvaluator
 
 __all__ = [
-    "RetrievalEvaluator",
+    "ChatEvaluator",
 ]
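For completeness, b2-era usage of the re-exported ChatEvaluator looked roughly like the sketch below; the constructor arguments and conversation shape are assumptions based on the beta-period API, not confirmed by this diff:

```python
from azure.ai.evaluation import ChatEvaluator

# Hypothetical Azure OpenAI model configuration; field names are illustrative.
model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment>",
}

chat_eval = ChatEvaluator(model_config=model_config)
result = chat_eval(
    conversation=[
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
)
```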