azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +110 -50
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +2 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +72 -38
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +54 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -3,24 +3,23 @@
 # ---------------------------------------------------------
 import inspect
 import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -35,6 +34,7 @@ from ._utils import (
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
@@ -69,10 +69,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-
-
-
-
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
 
     return renamed_cols, metric_columns
 
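
The new try/except guards the NaN-safe mean against columns where every score came back as NaN/None: instead of failing, the aggregate is skipped and a warning is logged. A minimal standalone sketch of that behaviour, assuming `list_mean_nan_safe` ignores NaN values and raises when nothing numeric remains (the helper below is illustrative, not the package's implementation):

```python
import math
from typing import List


def list_mean_nan_safe_sketch(values: List[float]) -> float:
    """Illustrative stand-in: mean over non-NaN values, error if every value is NaN."""
    clean = [v for v in values if not math.isnan(v)]
    if not clean:
        # The package raises EvaluationException here; ValueError keeps the sketch self-contained.
        raise ValueError("All score evaluations are NaN/None. No aggregation can be performed.")
    return sum(clean) / len(clean)


print(round(list_mean_nan_safe_sketch([4.0, float("nan"), 5.0]), 2))  # 4.5

try:
    list_mean_nan_safe_sketch([float("nan"), float("nan")])
except ValueError as exc:
    print(f"warning: {exc}")  # the real code logs this via LOGGER.warning
```
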
@@ -119,11 +120,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-
-
-
-
-
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates
 
 
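
A content-safety defect rate is simply the share of rows whose severity score meets or exceeds the threshold, computed over the rows that actually produced a score. A small pandas sketch of that arithmetic; the threshold of 4 is an assumption, since the value of `CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT` is not shown in this diff:

```python
import pandas as pd

DEFECT_THRESHOLD = 4  # assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

scores = pd.to_numeric(pd.Series([0, 2, 5, None, 7]), errors="coerce")
flags = scores.dropna().apply(lambda x: 1 if x >= DEFECT_THRESHOLD else 0)  # 1 = defect
print(round(flags.mean(), 2))  # 0.5 -> two of the four scored rows are at or above the threshold
```
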
@@ -153,10 +158,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-
-
-
-
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates
 
 
@@ -193,6 +199,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
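
The NOTE above comes down to pandas dtype behaviour: a pure True/False column is treated as numeric and gets a mean, but once a None/NaN appears the column becomes object dtype and `mean(numeric_only=True)` silently drops it. A quick illustration (not part of the package):

```python
import pandas as pd

df = pd.DataFrame({
    "clean_bool": [True, False, True],      # bool dtype -> included in the numeric mean
    "bool_with_none": [True, None, False],  # object dtype -> skipped by numeric_only=True
})
print(df.mean(numeric_only=True))  # only clean_bool appears, with mean ~0.67
```
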
@@ -287,7 +296,13 @@ def _validate_columns_for_evaluators(
         # Ignore the missing fields if "conversation" presents in the input data
         missing_inputs = []
     else:
-
+        optional_params = (
+            evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+            if hasattr(evaluator, "_OPTIONAL_PARAMS")
+            else []
+        )
+        excluded_params = set(new_df.columns).union(optional_params)
+        missing_inputs = [col for col in evaluator_params if col not in excluded_params]
 
     # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
     # Otherwise, remove it from the missing inputs
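
In other words, a required evaluator input only counts as missing when it is neither a column of the (possibly target-augmented) data nor one of the evaluator's declared optional parameters. A hypothetical standalone version of that check, with names chosen purely for illustration:

```python
from typing import Iterable, List, Set


def find_missing_inputs(
    evaluator_params: Iterable[str],
    data_columns: Set[str],
    optional_params: Iterable[str] = (),
) -> List[str]:
    """Hypothetical helper mirroring the excluded_params logic in the hunk above."""
    excluded = set(data_columns).union(optional_params)
    return [param for param in evaluator_params if param not in excluded]


print(find_missing_inputs(["query", "response", "context"], {"query", "response"}, ["context"]))  # []
print(find_missing_inputs(["query", "response", "context"], {"query"}))  # ['response', 'context']
```
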
@@ -391,7 +406,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project
         )
 
         output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
+        if output_dir and not os.path.exists(output_dir):
             msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
@@ -451,33 +466,14 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
     _run_name = kwargs.get("_run_name")
-
-
-
-
-
-
-
-
-                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                stream=True,
-                name=_run_name,
-            )
-    except (UserAuthenticationError, UploadInternalError) as ex:
-        if "Failed to upload run" in ex.message:
-            msg = (
-                "Failed to upload the target run to the cloud. "
-                "This may be caused by insufficient permission to access storage or other errors."
-            )
-            raise EvaluationException(
-                message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                blame=ErrorBlame.USER_ERROR,
-                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-            ) from ex
-
-        raise ex
+    with TargetRunContext():
+        run: Run = pf_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+        )
 
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
@@ -601,48 +597,14 @@ def evaluate(
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult
 
-
-
-    Evaluate API can be used as follows:
-
-    .. code-block:: python
-
-        from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
-        model_config = {
-            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-        }
-
-        coherence_eval = CoherenceEvaluator(model_config=model_config)
-        relevance_eval = RelevanceEvaluator(model_config=model_config)
-
-        path = "evaluate_test_data.jsonl"
-        result = evaluate(
-            data=path,
-            evaluators={
-                "coherence": coherence_eval,
-                "relevance": relevance_eval,
-            },
-            evaluator_config={
-                "coherence": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "query": "${data.query}",
-                    },
-                },
-                "relevance": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "context": "${data.context}",
-                        "query": "${data.query}",
-                    },
-                },
-            },
-        )
+    .. admonition:: Example:
 
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
     """
     try:
         return _evaluate(
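
The inline example was moved out of the docstring and into a samples file pulled in via `literalinclude`. For readers of this diff, here is a runnable version of the example that was removed, with the Azure OpenAI environment variables and the data path as placeholders you would supply yourself:

```python
import os

from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator

# Model configuration comes from environment variables; these are placeholders.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

coherence_eval = CoherenceEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)

result = evaluate(
    data="evaluate_test_data.jsonl",  # JSONL file with query/response/context columns
    evaluators={"coherence": coherence_eval, "relevance": relevance_eval},
    evaluator_config={
        "coherence": {"column_mapping": {"response": "${data.response}", "query": "${data.query}"}},
        "relevance": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            }
        },
    },
)
print(result["metrics"])  # EvaluationResult also carries "rows" and "studio_url"
```
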
@@ -698,7 +660,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n
+        print("\n====================================================\n")
 
 
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
@@ -728,36 +690,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-
-    try:
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-        # pylint: disable=raise-missing-from
-    except MissingAzurePackage:
-        msg = (
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        )
-
-        raise EvaluationException(  # pylint: disable=raise-missing-from
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_PACKAGE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-
-    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-
-    # Handle the case where the customer manually run "pf config set trace.destination=none"
-    if trace_destination and trace_destination.lower() == "none":
-        trace_destination = None
-
+    pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
@@ -831,11 +764,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
-
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
     else:
         data = input_data_df
         per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -877,20 +806,22 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
-
-
-
-
-
-
+
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+    studio_url = None
+    if trace_destination:
+        studio_url = _log_metrics_and_instance_results(
+            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
+        )
 
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
 
+    _print_summary(per_evaluator_results)
+
     if output_path:
         _write_output(output_path, result)
 
-    _print_summary(per_evaluator_results)
-
     return result
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -123,7 +123,8 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
             user_agent=USER_AGENT,
         )
 
-
+        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
         custom_dimensions: Dict[str, Union[str, bool]] = {
azure/ai/evaluation/_evaluate/_utils.py

@@ -7,12 +7,11 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional,
+from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 
 import pandas as pd
-from promptflow.client import PFClient
 from promptflow.entities import Run
 
 from azure.ai.evaluation._constants import (
@@ -21,9 +20,10 @@ from azure.ai.evaluation._constants import (
     EvaluationRunProperties,
     Prefixes,
 )
-from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
@@ -46,6 +46,8 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
+    from promptflow._cli._utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -59,10 +61,20 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")
 
 
 def load_jsonl(path):
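
The function pulls the workspace triad (subscription, resource group, workspace) out of an `azureml://` trace destination string and expects the regex to yield five groups, with the triad at groups 1, 3, and 5. The exact `AZURE_WORKSPACE_REGEX_FORMAT` is not shown in this diff, so the pattern below is an assumption about the typical format; the class name is likewise an illustrative stand-in for the package's `AzureMLWorkspace` named tuple:

```python
import re
from typing import NamedTuple


class AzureMLWorkspaceSketch(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


# Assumed shape of an azureml trace destination; the real constant may differ,
# but it must produce 5 groups with the triad at groups 1, 3 and 5.
PATTERN = (
    r"^azureml:/+subscriptions/([^/]+)/(resourceGroups|resourcegroups)/([^/]+)"
    r"(/providers/Microsoft\.MachineLearningServices)?/workspaces/([^/]+)$"
)

destination = (
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
    "/resourceGroups/my-rg/providers/Microsoft.MachineLearningServices/workspaces/my-workspace"
)
match = re.match(PATTERN, destination)
assert match is not None and len(match.groups()) == 5
print(AzureMLWorkspaceSketch(match.group(1), match.group(3), match.group(5)))
```
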
@@ -70,19 +82,6 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
-    from promptflow.azure._cli._utils import _get_azure_pf_client
-
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
-    )
-
-    return azure_pf_client, ws_triad
-
-
 def _store_multimodal_content(messages, tmpdir: str):
     # verify if images folder exists
     images_folder_path = os.path.join(tmpdir, "images")
@@ -92,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
     for message in messages:
         if isinstance(message.get("content", []), list):
             for content in message.get("content", []):
-
-
-
-
-
+                process_message_content(content, images_folder_path)
+
+
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")
 
-
-
-        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+        if not image_url or "url" not in image_url:
+            return None
 
-
-
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
 
-
-
-
-
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None
 
 
 def _log_metrics_and_instance_results(
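
The new `process_message_content` helper swaps inline base64 `data:image/...` URLs for files on disk before the results are uploaded. A self-contained sketch of the same parsing steps on a dummy payload (the bytes below are not a real image):

```python
import base64
import os
import re
import tempfile
import uuid

# Dummy payload standing in for a real base64-encoded image.
payload = base64.b64encode(b"not-really-an-image").decode()
content = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{payload}"}}

url = content["image_url"]["url"]
match = re.search("data:image/([^;]+);", url)
assert match is not None
ext = match.group(1)  # "png"

raw = base64.b64decode(url.replace(f"data:image/{ext};base64,", ""))
file_name = f"{uuid.uuid4()}.{ext}"

with tempfile.TemporaryDirectory() as tmpdir:
    images_folder = os.path.join(tmpdir, "images")
    os.makedirs(images_folder, exist_ok=True)
    with open(os.path.join(images_folder, file_name), "wb") as f:
        f.write(raw)
    content["image_url"]["url"] = f"images/{file_name}"  # message now points at the file
    print(content["image_url"]["url"])
```
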
@@ -117,27 +133,37 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Run,
     evaluation_name: Optional[str],
+    **kwargs,
 ) -> Optional[str]:
+    from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
     if trace_destination is None:
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
 
-
-
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
 
     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values
 
     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-
+        management_client=management_client,
         promptflow_run=run,
     ) as ev_run:
-        artifact_name = EvalRun.EVALUATION_ARTIFACT
+        artifact_name = EvalRun.EVALUATION_ARTIFACT
 
         with tempfile.TemporaryDirectory() as tmpdir:
             # storing multi_modal images if exists
@@ -164,9 +190,15 @@ def _log_metrics_and_instance_results(
             ev_run.write_properties_to_run_history(
                 properties={
                     EvaluationRunProperties.RUN_TYPE: "eval_run",
-                    EvaluationRunProperties.EVALUATION_RUN: "
+                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-
+                }
+            )
+        else:
+            ev_run.write_properties_to_run_history(
+                properties={
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 }
             )
 
@@ -211,6 +243,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -26,31 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-
+    Calculate the BLEU score for a given response and ground truth.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
-    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
-    better quality.
+    translation. It is widely used in text summarization and text generation use cases.
 
-
+    Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+    indicator of quality.
 
-
+    The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
 
-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-
-
-
-
-
-
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
 
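
The inline snippet removed from this docstring showed the evaluator being called directly with a response and a ground truth; the example now lives in the referenced samples file. A reconstruction of that usage, assuming `BleuScoreEvaluator` is exported from the top-level `azure.ai.evaluation` package and returns a dictionary containing a `bleu_score` entry (both stated here as assumptions):

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Expected to contain a bleu_score value between 0 and 1, higher meaning closer n-gram overlap.
print(result)
```
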