azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +80 -29
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +6 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_simulator.py +51 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py:

```diff
@@ -3,17 +3,18 @@
 # ---------------------------------------------------------
 import inspect
 import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
@@ -35,6 +36,7 @@ from ._utils import (
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
@@ -69,10 +71,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
 renamed_cols.append(col)
 new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
 col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-
-
-
-
+try:
+metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+except EvaluationException: # only exception that can be cause is all NaN values
+msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+LOGGER.warning(msg)
 
 return renamed_cols, metric_columns
 
@@ -119,11 +122,15 @@ def _aggregate_content_safety_metrics(
 for col in content_safety_df.columns:
 defect_rate_name = col.replace("_score", "_defect_rate")
 col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-
-
-
-
-
+try:
+col_with_boolean_values = apply_transform_nan_safe(
+col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+)
+defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+except EvaluationException: # only exception that can be cause is all NaN values
+msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+LOGGER.warning(msg)
+
 return content_safety_cols, defect_rates
 
 
@@ -153,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
 for col in label_df.columns:
 defect_rate_name = col.replace("_label", "_defect_rate")
 col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-
-
-
-
+try:
+defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+except EvaluationException: # only exception that can be cause is all NaN values
+msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+LOGGER.warning(msg)
 return label_cols, defect_rates
 
 
@@ -193,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
 # For rest of metrics, we will calculate mean
 df.drop(columns=handled_columns, inplace=True)
 
+# NOTE: nan/None values don't count as as booleans, so boolean columns with
+# nan/None values won't have a mean produced from them.
+# This is different from label-based known evaluators, which have special handling.
 mean_value = df.mean(numeric_only=True)
 metrics = mean_value.to_dict()
 # Add defect rates back into metrics
@@ -287,7 +298,13 @@ def _validate_columns_for_evaluators(
 # Ignore the missing fields if "conversation" presents in the input data
 missing_inputs = []
 else:
-
+optional_params = (
+evaluator._OPTIONAL_PARAMS # pylint: disable=protected-access
+if hasattr(evaluator, "_OPTIONAL_PARAMS")
+else []
+)
+excluded_params = set(new_df.columns).union(optional_params)
+missing_inputs = [col for col in evaluator_params if col not in excluded_params]
 
 # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
 # Otherwise, remove it from the missing inputs
@@ -391,7 +408,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 )
 
 output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-if not os.path.exists(output_dir):
+if output_dir and not os.path.exists(output_dir):
 msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
 raise EvaluationException(
 message=msg,
@@ -601,48 +618,14 @@ def evaluate(
 :return: Evaluation results.
 :rtype: ~azure.ai.evaluation.EvaluationResult
 
-
-
-Evaluate API can be used as follows:
-
-.. code-block:: python
-
-from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
-model_config = {
-"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-"api_key": os.environ.get("AZURE_OPENAI_KEY"),
-"azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-}
-
-coherence_eval = CoherenceEvaluator(model_config=model_config)
-relevance_eval = RelevanceEvaluator(model_config=model_config)
-
-path = "evaluate_test_data.jsonl"
-result = evaluate(
-data=path,
-evaluators={
-"coherence": coherence_eval,
-"relevance": relevance_eval,
-},
-evaluator_config={
-"coherence": {
-"column_mapping": {
-"response": "${data.response}",
-"query": "${data.query}",
-},
-},
-"relevance": {
-"column_mapping": {
-"response": "${data.response}",
-"context": "${data.context}",
-"query": "${data.query}",
-},
-},
-},
-)
+.. admonition:: Example:
 
+.. literalinclude:: ../samples/evaluation_samples_evaluate.py
+:start-after: [START evaluate_method]
+:end-before: [END evaluate_method]
+:language: python
+:dedent: 8
+:caption: Run an evaluation on local data with Coherence and Relevance evaluators.
 """
 try:
 return _evaluate(
@@ -698,7 +681,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
 if output_dict:
 print("======= Combined Run Summary (Per Evaluator) =======\n")
 print(json.dumps(output_dict, indent=4))
-print("\n
+print("\n====================================================\n")
 
 
 def _evaluate( # pylint: disable=too-many-locals,too-many-statements
@@ -728,36 +711,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 if target is not None:
 _validate_columns_for_target(input_data_df, target)
 
-
-try:
-pf_client = PFClient(
-config=(
-{"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-if azure_ai_project
-else None
-),
-user_agent=USER_AGENT,
-)
-# pylint: disable=raise-missing-from
-except MissingAzurePackage:
-msg = (
-"The required packages for remote tracking are missing.\n"
-'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-)
-
-raise EvaluationException( # pylint: disable=raise-missing-from
-message=msg,
-target=ErrorTarget.EVALUATE,
-category=ErrorCategory.MISSING_PACKAGE,
-blame=ErrorBlame.USER_ERROR,
-)
-
-trace_destination: Optional[str] = pf_client._config.get_trace_destination() # pylint: disable=protected-access
-
-# Handle the case where the customer manually run "pf config set trace.destination=none"
-if trace_destination and trace_destination.lower() == "none":
-trace_destination = None
-
+pf_client = PFClient(user_agent=USER_AGENT)
 target_run: Optional[Run] = None
 
 # Create default configuration for evaluators that directly maps
@@ -831,11 +785,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
 # multiple evaluators. If the path is already absolute, abspath will return the original path.
 data = os.path.abspath(data)
-
-# A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-# The root cause is still unclear, but it seems related to a conflict between the async run uploader
-# and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
 else:
 data = input_data_df
 per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -877,20 +827,26 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
 metrics = _aggregate_metrics(evaluators_result_df, evaluators)
 metrics.update(evaluators_metric)
-
-
-
-
-
-
-
+
+# Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+target_run = None
+trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+studio_url = None
+if trace_destination:
+studio_url = _log_metrics_and_instance_results(
+metrics,
+result_df,
+trace_destination,
+target_run,
+evaluation_name,
+)
 
 result_df_dict = result_df.to_dict("records")
 result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
 
+_print_summary(per_evaluator_results)
+
 if output_path:
 _write_output(output_path, result)
 
-_print_summary(per_evaluator_results)
-
 return result
```
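Several docstrings in this release swap inline usage examples for Sphinx `literalinclude` directives pointing at `../samples/evaluation_samples_evaluate.py`. For readers of this diff, the `evaluate` pattern documented by the removed 1.0.0b5 docstring looked like the sketch below; it is reconstructed from the removed lines above, so the data file name, environment variables, and column mappings come from that old example rather than from any new API.

```python
import os

from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator, evaluate

# Model configuration, as in the removed docstring example (values come from the environment).
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

coherence_eval = CoherenceEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)

# Local JSONL file, one record per line, e.g. {"query": ..., "response": ..., "context": ...}
path = "evaluate_test_data.jsonl"

result = evaluate(
    data=path,
    evaluators={
        "coherence": coherence_eval,
        "relevance": relevance_eval,
    },
    evaluator_config={
        "coherence": {
            "column_mapping": {"response": "${data.response}", "query": "${data.query}"},
        },
        "relevance": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            },
        },
    },
)
```

With the 1.0.1 changes above, the same call prints the combined run summary before writing `output_path`, and only attempts to log results to AI Studio when `azure_ai_project` yields a trace destination.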
azure/ai/evaluation/_evaluate/_telemetry/__init__.py:

```diff
@@ -123,7 +123,8 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
 user_agent=USER_AGENT,
 )
 
-
+trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access
+track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
 evaluate_target = bool(kwargs.get("target", None))
 evaluator_config = bool(kwargs.get("evaluator_config", None))
 custom_dimensions: Dict[str, Union[str, bool]] = {
```
azure/ai/evaluation/_evaluate/_utils.py:

```diff
@@ -21,7 +21,6 @@ from azure.ai.evaluation._constants import (
 EvaluationRunProperties,
 Prefixes,
 )
-from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
@@ -118,6 +117,8 @@ def _log_metrics_and_instance_results(
 run: Run,
 evaluation_name: Optional[str],
 ) -> Optional[str]:
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
 if trace_destination is None:
 LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
 return None
@@ -137,7 +138,7 @@
 ml_client=azure_pf_client.ml_client,
 promptflow_run=run,
 ) as ev_run:
-artifact_name = EvalRun.EVALUATION_ARTIFACT
+artifact_name = EvalRun.EVALUATION_ARTIFACT
 
 with tempfile.TemporaryDirectory() as tmpdir:
 # storing multi_modal images if exists
@@ -164,9 +165,8 @@
 ev_run.write_properties_to_run_history(
 properties={
 EvaluationRunProperties.RUN_TYPE: "eval_run",
-EvaluationRunProperties.EVALUATION_RUN: "
+EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-"isEvaluatorRun": "true",
 }
 )
 
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
 with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
 json.dump(data_dict, f)
 
+print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
 source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
```
azure/ai/evaluation/_evaluators/_bleu/_bleu.py:

```diff
@@ -26,31 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
 """
-
+Calculate the BLEU score for a given response and ground truth.
 
 BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-translation. It is widely used in text summarization and text generation use cases.
-generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
-better quality.
+translation. It is widely used in text summarization and text generation use cases.
 
-
+Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+indicator of quality.
 
-
+The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
 
-
-result = eval_fn(
-response="Tokyo is the capital of Japan.",
-ground_truth="The capital of Japan is Tokyo.")
+.. admonition:: Example:
 
-
-
-
-
-
-
-}
+.. literalinclude:: ../samples/evaluation_samples_evaluate.py
+:start-after: [START bleu_score_evaluator]
+:end-before: [END bleu_score_evaluator]
+:language: python
+:dedent: 8
+:caption: Initialize and call an BleuScoreEvaluator.
 """
 
+id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
 def __init__(self):
 self._async_evaluator = _AsyncBleuScoreEvaluator()
 
```
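The BLEU evaluator's docstring likewise drops its inline snippet in favor of a `literalinclude`. Based on the removed example lines and the unchanged no-argument `__init__`, the call pattern is roughly the following sketch; the exact result key is not shown in this diff and is hedged in the comment.

```python
from azure.ai.evaluation import BleuScoreEvaluator

# No constructor arguments, per the __init__ shown in the hunk above.
eval_fn = BleuScoreEvaluator()
result = eval_fn(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)  # a dict with a single BLEU score entry, e.g. {"bleu_score": ...}
```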
azure/ai/evaluation/_evaluators/_coherence/_coherence.py:

```diff
@@ -2,70 +2,101 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import
+from typing import Dict, Union, List
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class CoherenceEvaluator(PromptyEvaluatorBase):
+class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 """
-
+Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+The coherence measure assesses the ability of the language model to generate text that reads naturally,
+flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+and user-friendliness of a model's generated responses in real-world applications.
 
 :param model_config: Configuration for the Azure OpenAI model.
 :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-
-
-.. code-block:: python
-
-eval_fn = CoherenceEvaluator(model_config)
-result = eval_fn(
-query="What is the capital of Japan?",
-response="The capital of Japan is Tokyo.")
+.. admonition:: Example:
 
-
+.. literalinclude:: ../samples/evaluation_samples_evaluate.py
+:start-after: [START coherence_evaluator]
+:end-before: [END coherence_evaluator]
+:language: python
+:dedent: 8
+:caption: Initialize and call a CoherenceEvaluator with a query and response.
 
-..
+.. note::
 
-
-
-
-}
-
-Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
 """
 
 _PROMPTY_FILE = "coherence.prompty"
 _RESULT_KEY = "coherence"
 
+id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
 @override
 def __init__(self, model_config):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-@
+@overload
+def __call__(
+self,
+*,
+query: str,
+response: str,
+) -> Dict[str, Union[str, float]]:
+"""Evaluate coherence for given input of query, response
+
+:keyword query: The query to be evaluated.
+:paramtype query: str
+:keyword response: The response to be evaluated.
+:paramtype response: str
+:return: The coherence score.
+:rtype: Dict[str, float]
+"""
+
+@overload
 def __call__(
 self,
 *,
-
-
-conversation
+conversation: Conversation,
+) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+"""Evaluate coherence for a conversation
+
+:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+key "messages", and potentially a global context under the key "context". Conversation turns are expected
+to be dictionaries with keys "content", "role", and possibly "context".
+:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+:return: The coherence score.
+:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+"""
+
+@override
+def __call__( # pylint: disable=docstring-missing-param
+self,
+*args,
 **kwargs,
 ):
 """Evaluate coherence. Accepts either a query and response for a single evaluation,
 or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
 turns, the evaluator will aggregate the results of each turn.
 
+:keyword query: The query to be evaluated.
+:paramtype query: str
 :keyword response: The response to be evaluated.
 :paramtype response: Optional[str]
-:keyword context: The context to be evaluated.
-:paramtype context: Optional[str]
 :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
 key "messages". Conversation turns are expected
 to be dictionaries with keys "content" and "role".
@@ -73,4 +104,4 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
 :return: The relevance score.
 :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
 """
-return super().__call__(
+return super().__call__(*args, **kwargs)
```
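With the `@overload` signatures added above, `CoherenceEvaluator.__call__` now advertises two call shapes: a single query/response pair, or a whole conversation. Below is a hedged sketch of both forms, reusing the query and response from the removed inline example and the conversation layout described in the new docstring; the model configuration is a placeholder read from the environment.

```python
import os

from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}
coherence = CoherenceEvaluator(model_config=model_config)

# Single-turn overload: query and response keyword arguments.
single_turn = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)

# Conversation overload: a dict with a "messages" list whose turns carry "content" and "role".
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
multi_turn = coherence(conversation=conversation)
```

Per the new `:rtype:` annotations, the single-turn form returns a flat score dictionary, while the conversation form returns aggregate scores alongside per-turn lists.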
azure/ai/evaluation/_evaluators/_common/_base_eval.py:

```diff
@@ -7,7 +7,7 @@ from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from typing_extensions import ParamSpec, TypeAlias
+from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -88,7 +88,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 # This needs to be overridden just to change the function header into something more informative,
 # and to be able to add a more specific docstring. The actual function contents should just be
 # super().__call__(<inputs>)
-def __call__(
+def __call__( # pylint: disable=docstring-missing-param
+self,
+*args,
+**kwargs,
+) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
 """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
 one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
 The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -127,11 +131,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 :rtype: List[str]
 """
 
+overloads = get_overloads(self.__call__)
+if not overloads:
+call_signatures = [inspect.signature(self.__call__)]
+else:
+call_signatures = [inspect.signature(overload) for overload in overloads]
 call_signature = inspect.signature(self.__call__)
 singletons = []
-for
-
-
+for call_signature in call_signatures:
+params = call_signature.parameters
+if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+continue
+# exclude self since it is not a singleton input
+singletons.extend([p for p in params if p != "self"])
 return singletons
 
 def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
```
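The singleton-input derivation above now walks every registered `__call__` overload via `typing_extensions.get_overloads` instead of inspecting only the catch-all signature, which keeps named inputs like `query` and `response` discoverable now that the concrete `__call__` implementations take `*args, **kwargs`. Below is a standalone sketch of that mechanism; `ExampleEvaluator`, its `singleton_inputs` method, and the hard-coded `conversation` exclusion are illustrative stand-ins rather than the library's real base class.

```python
import inspect
from typing import Dict

from typing_extensions import get_overloads, overload


class ExampleEvaluator:
    """Illustrative stand-in for an evaluator whose __call__ has typed overloads."""

    @overload
    def __call__(self, *, query: str, response: str) -> Dict[str, float]: ...

    @overload
    def __call__(self, *, conversation: dict) -> Dict[str, float]: ...

    def __call__(self, *args, **kwargs):
        # Catch-all implementation, analogous to the base-class __call__ in the hunk above.
        return {}

    def singleton_inputs(self):
        # Fall back to the catch-all signature when no overloads are registered,
        # otherwise collect parameters from each overload, skipping non-singleton ones.
        registered = get_overloads(self.__call__)
        signatures = [inspect.signature(fn) for fn in registered] or [inspect.signature(self.__call__)]
        singletons = []
        for sig in signatures:
            params = sig.parameters
            if "conversation" in params:  # stand-in for the non-singleton input check
                continue
            singletons.extend(p for p in params if p != "self")
        return singletons


print(ExampleEvaluator().singleton_inputs())  # ['query', 'response']
```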
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py:

```diff
@@ -4,7 +4,7 @@
 
 import math
 import re
-from typing import Dict, Union
+from typing import Dict, TypeVar, Union
 
 from promptflow.core import AsyncPrompty
 from typing_extensions import override
@@ -18,8 +18,10 @@ try:
 except ImportError:
 USER_AGENT = "None"
 
+T = TypeVar("T")
 
-
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
 """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
 make use of a prompty file, and return their results as a dictionary, with a single key-value pair
 linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
```
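`PromptyEvaluatorBase` is now generic in its result value type (`EvaluatorBase[T]` with `T = TypeVar("T")`), and subclasses pin it down, as `CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]])` does earlier in this diff. A minimal, self-contained sketch of the typing pattern follows; these stripped-down classes only mirror the annotations, not the package's real behavior.

```python
from typing import Dict, Generic, TypeVar, Union

T = TypeVar("T")


class EvaluatorBase(Generic[T]):
    """Stripped-down stand-in: the real base also handles conversations and aggregation."""

    def __call__(self, *args, **kwargs) -> Dict[str, T]:
        raise NotImplementedError


class PromptyEvaluatorBase(EvaluatorBase[T]):
    """Passing T through keeps the concrete result value type visible to subclasses."""


class CoherenceLikeEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Hypothetical subclass: result values are floats (scores) or strings (reasons)."""

    def __call__(self, *args, **kwargs) -> Dict[str, Union[str, float]]:
        return {"coherence": 4.0, "coherence_reason": "Reads naturally and flows smoothly."}


print(CoherenceLikeEvaluator()())
```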
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py:

```diff
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict,
+from typing import Dict, TypeVar, Union
 
 from typing_extensions import override
 
@@ -18,7 +18,7 @@ from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
 
-T =
+T = TypeVar("T")
 
 
 class RaiServiceEvaluatorBase(EvaluatorBase[T]):
@@ -50,12 +50,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
 self._credential = credential
 
 @override
-def __call__(
+def __call__( # pylint: disable=docstring-missing-param
 self,
-
-query: Optional[str] = None,
-response: Optional[str] = None,
-conversation=None,
+*args,
 **kwargs,
 ):
 """Evaluate either a query and response or a conversation. Must supply either a query AND response,
@@ -71,7 +68,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
 :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
 :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
 """
-return super().__call__(
+return super().__call__(*args, **kwargs)
 
 
 @override
@@ -108,7 +105,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
 )
 input_data["context"] = context
 
-return await evaluate_with_rai_service(
+return await evaluate_with_rai_service( # type: ignore
 metric_name=self._eval_metric,
 data=input_data,
 project_scope=self._azure_ai_project,
```