azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
In the hunks below, lines prefixed with `-` come from azure-ai-evaluation 1.0.0 and lines prefixed with `+` come from 1.0.0b1; lines shown blank or cut short after a `-` were truncated in the registry's diff view.

azure/ai/evaluation/_evaluate/_telemetry/__init__.py

```diff
@@ -6,40 +6,38 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict
 
 import pandas as pd
+
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
 from promptflow._sdk.entities._flows import Prompty as prompty_sdk
 from promptflow._sdk.entities._flows.dag import Flow as dag_flow
 from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
-from typing_extensions import ParamSpec
-
-from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
-P = ParamSpec("P")
-
 
-def _get_evaluator_type(evaluator: Dict[str, Callable])
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: str
     """
-
-
+    built_in = False
+    content_safety = False
 
-
-
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
@@ -84,7 +82,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         name = str(evaluator)
         pf_type = "Unknown"
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LOGGER.debug("Failed to get evaluator properties:
+        LOGGER.debug(f"Failed to get evaluator properties: {e}")
         name = str(evaluator)
         pf_type = "Unknown"
 
@@ -97,22 +95,20 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+def log_evaluate_activity(func) -> None:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
     """
 
     @functools.wraps(func)
-    def wrapper(*args
+    def wrapper(*args, **kwargs) -> Callable:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators =
-        azure_ai_project =
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)
 
         pf_client = PFClient(
             config=(
@@ -123,11 +119,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
             user_agent=USER_AGENT,
         )
 
-
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+        track_in_cloud = bool(pf_client._config.get_trace_destination())
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions
+        custom_dimensions = {
             "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,
@@ -159,7 +154,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
                 evaluator_info["failed_rows"] = failed_rows
                 evaluator_info["total_rows"] = total_rows
             except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate failed row info for
+                LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
             evaluators_info.append(evaluator_info)
 
         custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
@@ -172,7 +167,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
                 ):
                     pass
             except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info:
+                LOGGER.debug(f"Failed to collect evaluate usage info: {e}")
 
             return result
 
```
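The evaluator-type heuristic that the beta adds to `_get_evaluator_type` is easy to exercise on its own: anything whose defining module sits under `azure.ai.evaluation._evaluators.` counts as built-in, with a further prefix check for the content-safety family. The sketch below re-implements that check for illustration; `classify_evaluator` and `my_custom_evaluator` are hypothetical names, not part of the SDK.

```python
import inspect


def classify_evaluator(evaluator) -> str:
    """Standalone version of the module-prefix heuristic shown in the hunk above."""
    module = inspect.getmodule(evaluator)
    built_in = module is not None and module.__name__.startswith("azure.ai.evaluation._evaluators.")
    if built_in and module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety"):
        return "content-safety"
    return "built-in" if built_in else "custom"


def my_custom_evaluator(response: str, ground_truth: str) -> dict:
    # Defined outside the azure.ai.evaluation._evaluators namespace, so it is reported as "custom".
    return {"exact_match": float(response == ground_truth)}


print(classify_evaluator(my_custom_evaluator))  # -> custom
```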
azure/ai/evaluation/_evaluate/_utils.py

```diff
@@ -6,23 +6,14 @@ import logging
 import os
 import re
 import tempfile
+from collections import namedtuple
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
-import uuid
-import base64
 
 import pandas as pd
-
-from
-
-from azure.ai.evaluation._constants import (
-    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-    DefaultOpenEncoding,
-    EvaluationRunProperties,
-    Prefixes,
-)
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
 
 LOGGER = logging.getLogger(__name__)
 
@@ -31,45 +22,39 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
-class AzureMLWorkspace(NamedTuple):
-    subscription_id: str
-    resource_group_name: str
-    workspace_name: str
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
 
 
-def is_none(value)
+def is_none(value):
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-    trace_provider: str,
-) -> AzureMLWorkspace:
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
-
-
-
-
-
-
-
-
-
-
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
-    with open(path, "r", encoding=
+    with open(path, "r", encoding="utf-8") as f:
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+def _azure_pf_client_and_triad(trace_destination):
     from promptflow.azure._cli._utils import _get_azure_pf_client
 
     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
```
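Both versions reduce a trace-provider string of the form `azureml://subscriptions/<subscription_id>/resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace_name>` to a workspace triad; 1.0.0 returns a typed `NamedTuple`, the beta a `collections.namedtuple`, and attribute access is the same either way. Below is a minimal sketch of that parsing, using a simplified pattern rather than the SDK's `AZURE_WORKSPACE_REGEX_FORMAT` (only the tail of the real pattern is visible above).

```python
import re
from collections import namedtuple

AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])

# Simplified, illustrative pattern; the SDK's AZURE_WORKSPACE_REGEX_FORMAT is stricter.
_TRACE_PROVIDER_PATTERN = re.compile(
    r"azureml://subscriptions/(?P<sub>[^/]+)/resource[gG]roups/(?P<rg>[^/]+)"
    r"(?:/providers/Microsoft\.MachineLearningServices)?/workspaces/(?P<ws>[^/]+)$"
)


def parse_trace_provider(trace_provider: str) -> AzureMLWorkspaceTriad:
    match = _TRACE_PROVIDER_PATTERN.match(trace_provider)
    if not match:
        raise ValueError(f"Malformed trace provider string: {trace_provider}")
    return AzureMLWorkspaceTriad(match.group("sub"), match.group("rg"), match.group("ws"))


triad = parse_trace_provider(
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-workspace"
)
print(triad.workspace_name)  # -> my-workspace
```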
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -82,45 +67,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     return azure_pf_client, ws_triad
 
 
-def _store_multimodal_content(messages, tmpdir: str):
-    # verify if images folder exists
-    images_folder_path = os.path.join(tmpdir, "images")
-    os.makedirs(images_folder_path, exist_ok=True)
-
-    # traverse all messages and replace base64 image data with new file name.
-    for message in messages:
-        if isinstance(message.get("content", []), list):
-            for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
-
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
-
-
 def _log_metrics_and_instance_results(
-    metrics
-    instance_results
-    trace_destination
-    run
-    evaluation_name
-) ->
-    from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
     if trace_destination is None:
-        LOGGER.
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
         return None
 
     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -138,21 +93,13 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            # storing multi_modal images if exists
-            col_name = "inputs.conversation"
-            if col_name in instance_results.columns:
-                for item in instance_results[col_name].items():
-                    value = item[1]
-                    if "messages" in value:
-                        _store_multimodal_content(value["messages"], tmpdir)
-
-            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
-            with open(tmp_path, "w", encoding=
+            with open(tmp_path, "w", encoding="utf-8") as f:
                 f.write(instance_results.to_json(orient="records", lines=True))
 
             ev_run.log_artifact(tmpdir, artifact_name)
```
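The artifact upload in `_log_metrics_and_instance_results` boils down to serializing the per-row results DataFrame as JSON Lines inside a temporary directory and handing that directory to the run. A standalone sketch of the serialization step (the rows and artifact name below are placeholders, not the SDK's `EvalRun.EVALUATION_ARTIFACT` constant):

```python
import os
import tempfile

import pandas as pd

# Placeholder rows standing in for the evaluation's per-line results.
instance_results = pd.DataFrame(
    [
        {"inputs.query": "What is the capital of Japan?", "outputs.bleu.bleu_score": 0.22},
        {"inputs.query": "Who wrote Hamlet?", "outputs.bleu.bleu_score": 0.95},
    ]
)

artifact_name = "instance_results.jsonl"  # illustrative name only

with tempfile.TemporaryDirectory() as tmpdir:
    tmp_path = os.path.join(tmpdir, artifact_name)
    # Same call as in the diff: one JSON object per result row.
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(instance_results.to_json(orient="records", lines=True))
    with open(tmp_path, encoding="utf-8") as f:
        print(f.read())
```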
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -164,9 +111,9 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-
-                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    "_azureml.evaluation_run": "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                    "isEvaluatorRun": "true",
                 }
             )
 
@@ -190,7 +137,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -203,20 +150,16 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     return trace_destination
 
 
-def _write_output(path
+def _write_output(path, data_dict):
     p = Path(path)
-    if
+    if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
-    with open(p, "w"
+    with open(p, "w") as f:
         json.dump(data_dict, f)
 
-    print(f'Evaluation results saved to "{p.resolve()}".\n')
-
 
-def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
-) -> pd.DataFrame:
+def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
 
```
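`_write_output` resolves a directory path to a default results file name and then dumps the results dictionary as JSON; the stable version additionally prints where the file landed. A minimal re-creation of that behavior, with a placeholder file name standing in for the SDK's `DEFAULT_EVALUATION_RESULTS_FILE_NAME` constant:

```python
import json
import os
from pathlib import Path

DEFAULT_RESULTS_FILE = "evaluation_results.json"  # placeholder for the SDK constant


def write_output(path: str, data_dict: dict) -> Path:
    p = Path(path)
    if os.path.isdir(path):
        p = p / DEFAULT_RESULTS_FILE
    with open(p, "w", encoding="utf-8") as f:
        json.dump(data_dict, f)
    return p


saved_to = write_output(".", {"metrics": {"bleu.bleu_score": 0.22}, "rows": []})
print(f'Evaluation results saved to "{saved_to.resolve()}".')
```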
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -224,11 +167,10 @@ def _apply_column_mapping(
     :param source_df: the data frame to be changed.
     :type source_df: pd.DataFrame
     :param mapping_config: The configuration, containing column mapping.
-    :type mapping_config:
+    :type mapping_config: dict.
     :param inplace: If true, the source_df will be changed inplace.
     :type inplace: bool
     :return: The modified data frame.
-    :rtype: pd.DataFrame
     """
     result_df = source_df
 
@@ -265,34 +207,31 @@ def _apply_column_mapping(
     return result_df
 
 
-def _has_aggregator(evaluator
+def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
-def get_int_env_var(env_var_name
+def get_int_env_var(env_var_name, default_value=None):
     """
-    The function `get_int_env_var` retrieves an integer environment variable value, with
+    The function `get_int_env_var` retrieves an integer environment variable value, with an optional
     default value if the variable is not set or cannot be converted to an integer.
 
     :param env_var_name: The name of the environment variable you want to retrieve the value of
-    :type env_var_name: str
     :param default_value: The default value is the value that will be returned if the environment
-
-    :type default_value: int
+    variable is not found or if it cannot be converted to an integer
     :return: an integer value.
-    :rtype: int
     """
     try:
-        return int(os.environ
-    except
+        return int(os.environ.get(env_var_name, default_value))
+    except Exception:
         return default_value
 
 
-def set_event_loop_policy()
+def set_event_loop_policy():
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
```
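`get_int_env_var` returns the default both when the variable is missing and when its value cannot be converted to an integer, which the broad `except` in the beta makes explicit. A quick check of that fallback behavior (the variable names below are arbitrary, not used by the SDK):

```python
import os


def get_int_env_var(env_var_name, default_value=None):
    # Same fallback behavior as the helper in the hunk above.
    try:
        return int(os.environ.get(env_var_name, default_value))
    except Exception:
        return default_value


os.environ["MY_WORKER_COUNT"] = "not-a-number"
print(get_int_env_var("MY_WORKER_COUNT", 4))    # -> 4: int() raises, so the default is used
print(get_int_env_var("MY_UNSET_VARIABLE", 8))  # -> 8: variable not set
print(get_int_env_var("MY_UNSET_VARIABLE"))     # -> None: int(None) raises and the default is None
```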
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

```diff
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
 
+from promptflow._utils.async_utils import async_run_allowing_running_loop
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 
@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-
+    Evaluator that computes the BLEU Score between two strings.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.
 
-
-    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-    indicator of quality.
+    **Usage**
 
-
+    .. code-block:: python
 
-
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-        :start-after: [START bleu_score_evaluator]
-        :end-before: [END bleu_score_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call an BleuScoreEvaluator.
-    """
+    **Output format**
 
-
-
+    .. code-block:: python
+
+        {
+            "bleu_score": 0.22
+        }
+    """
 
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
-        :rtype:
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
```
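Under the hood, both versions of `BleuScoreEvaluator` hand tokenized strings to NLTK's `sentence_bleu` with a smoothing function (see the imports at the top of `_bleu.py`). The sketch below reproduces that kind of call with naive whitespace tokenization and an arbitrarily chosen smoothing method, so its score will only approximate what the evaluator reports:

```python
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = "The capital of Japan is Tokyo.".split()
hypothesis = "Tokyo is the capital of Japan.".split()

# sentence_bleu expects a list of reference token lists plus one hypothesis token list.
score = sentence_bleu([reference], hypothesis, smoothing_function=SmoothingFunction().method4)
print({"bleu_score": round(score, 2)})
```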
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

```diff
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from .
+from ._chat import ChatEvaluator
 
 __all__ = [
-    "
+    "ChatEvaluator",
 ]
```