azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +158 -28
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -74
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +8 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +7 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +825 -450
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +25 -2
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +123 -65
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
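
The headline changes in 1.6.0 are the Azure OpenAI grader wrappers under `azure/ai/evaluation/_aoai/`, the remote grader orchestration in `azure/ai/evaluation/_evaluate/_evaluate_aoai.py`, and the reworked evaluation flow in `azure/ai/evaluation/_evaluate/_evaluate.py`, whose diff is reproduced below. Based only on the signature changes visible in that diff (`evaluate()` now accepts `AzureOpenAIGrader` instances alongside plain callables, and `azure_ai_project` may be a string as well as an `AzureAIProject` dict), a minimal usage sketch might look like the following. The data file, project URL format, and output path are illustrative assumptions, and grader construction is omitted because the grader constructors are not shown in this diff.

```python
from azure.ai.evaluation import evaluate


def response_length(*, response: str, **kwargs) -> dict:
    """A plain callable evaluator; these still run locally through the batch clients."""
    return {"response_length": len(response)}


result = evaluate(
    data="qa_pairs.jsonl",  # hypothetical JSONL file with a "response" column
    evaluators={
        "length": response_length,
        # "label_check": my_grader,  # new in 1.6.0: an AzureOpenAIGrader instance can be
        #                            # mixed in here and is split out into a remote Azure
        #                            # OpenAI eval run (see _evaluate_aoai.py above)
    },
    evaluation_name="example-run",
    # New in 1.6.0: azure_ai_project may be a plain project string instead of an
    # AzureAIProject dict; the exact URL format below is an assumption.
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    output_path="./eval_results.json",  # hypothetical output location
)
print(result["metrics"])
```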
azure/ai/evaluation/_evaluate/_evaluate.py

```diff
@@ -8,20 +8,26 @@ import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
 from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
+    BINARY_AGGREGATE_SUFFIX,
+    DEFAULT_OAI_EVAL_RUN_NAME
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
@@ -29,7 +35,6 @@ from ._batch_run import (
     EvalRunContext,
     CodeClient,
     ProxyClient,
-    ProxyRun,
     TargetRunContext,
     RunSubmitterClient,
 )
@@ -38,16 +43,22 @@ from ._utils import (
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
 )
-from ._batch_run.batch_clients import BatchClient
+from ._batch_run.batch_clients import BatchClient, BatchClientRun

+from ._evaluate_aoai import (
+    _begin_aoai_evaluation,
+    _split_evaluators_and_grader_configs,
+    _get_evaluation_run_results,
+    OAIEvalRunCreationInfo
+)
 LOGGER = logging.getLogger(__name__)

 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
 # means something sufficiently different.
-# Note that content safety metrics are handled
+# Note that content safety metrics are handled separately.
 METRIC_COLUMN_NAME_REPLACEMENTS = {
     "groundedness_pro_label": "groundedness_pro_passing_rate",
 }
@@ -58,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]

+class __ValidatedData(TypedDict):
+    '''
+    Simple dictionary that contains ALL pre-processed data and
+    the resultant objects that are needed for downstream evaluation.
+    '''
+    evaluators: Dict[str, Callable]
+    graders: Dict[str, AzureOpenAIGrader]
+    input_data_df: pd.DataFrame
+    column_mapping: Dict[str, Dict[str, str]]
+    target_run: Optional[BatchClientRun]
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

 def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     """Identify and average various metrics that need to have the metric name be replaced,
@@ -208,6 +232,48 @@ def _process_rows(row, detail_defect_rates):
     return detail_defect_rates


+def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+    """
+    Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+    For each evaluator, calculates the proportion of "pass" results.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A dictionary mapping evaluator names to the proportion of pass results.
+    :rtype: Dict[str, float]
+    """
+    results = {}
+
+    # Find all columns that end with "_result"
+    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+    for col in result_columns:
+        # Extract the evaluator name from the column name
+        # (outputs.<evaluator>.<metric>_result)
+        parts = col.split(".")
+        evaluator_name = None
+        if len(parts) >= 3:
+            evaluator_name = parts[1]
+        else:
+            LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+            continue
+        if evaluator_name:
+            # Count the occurrences of each unique value (pass/fail)
+            value_counts = df[col].value_counts().to_dict()
+
+            # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+            total_rows = len(df)
+            pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+            proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+            # Set the result with the evaluator name as the key
+            result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+            results[result_key] = round(proportion, 2)
+
+    return results
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -221,6 +287,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
+    binary_metrics = _aggregation_binary_output(df)
+
     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

     handled_columns = []
@@ -248,6 +316,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
     metrics.update(defect_rates)
+
+    # Add binary threshold metrics based on pass/fail results
+    metrics.update(binary_metrics)
+
     return metrics


@@ -486,12 +558,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _apply_target_to_data(
     target: Callable,
-    data: Union[str, os.PathLike],
+    data: Union[str, os.PathLike, pd.DataFrame],
     batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
-) -> Tuple[pd.DataFrame, Set[str],
+) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -509,24 +581,18 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """

-    if not isinstance(batch_client, ProxyClient):
-        raise ValueError("Only ProxyClient supports target runs for now.")
-
     _run_name = kwargs.get("_run_name")
-    with TargetRunContext():
-        run =
-
-
-
-
-
-
-            name=_run_name,
-        ),
+    with TargetRunContext(batch_client):
+        run: BatchClientRun = batch_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+            evaluator_name=getattr(target, "__qualname__", "TARGET"),
         )
-
-
-    run_summary = batch_client.get_run_summary(run)
+        target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+        run_summary = batch_client.get_run_summary(run)

     if run_summary["completed_lines"] == 0:
         msg = (
@@ -557,7 +623,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run
+    return target_output, generated_columns, run


 def _process_column_mappings(
@@ -573,7 +639,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -625,11 +691,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 def evaluate(
     *,
     data: Union[str, os.PathLike],
-    evaluators: Dict[str, Callable],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
@@ -641,8 +707,9 @@ def evaluate(
         JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-
+        and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+        Required.
+    :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -679,7 +746,7 @@ def evaluate(
         evaluation_name=evaluation_name,
         target=target,
         data=data,
-
+        evaluators_and_graders=evaluators,
         evaluator_config=evaluator_config,
         azure_ai_project=azure_ai_project,
         output_path=output_path,
@@ -744,23 +811,157 @@ def _print_fail_flag_warning() -> None:

 def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     *,
-
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-
+
+    # Turn inputted mess of data into a dataframe, apply targets if needed
+    # split graders and evaluators, and verify that column mappings are sensible.
+    validated_data = _preprocess_data(
+        data=data,
+        evaluators_and_graders=evaluators_and_graders,
+        evaluator_config=evaluator_config,
+        target=target,
+        output_path=output_path,
+        azure_ai_project=azure_ai_project,
+        evaluation_name=evaluation_name,
+        **kwargs,
+    )
+
+    # extract relevant info from validated data
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    graders = validated_data["graders"]
+    input_data_df = validated_data["input_data_df"]
+    results_df = pd.DataFrame()
+    metrics: Dict[str, float] = {}
+    eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+    # Start OAI eval runs if any graders are present.
+    need_oai_run = len(graders) > 0
+    need_local_run = len(evaluators) > 0
+    need_get_oai_results = False
+    got_local_results = False
+    if need_oai_run:
+        try:
+            aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+            eval_run_info_list = _begin_aoai_evaluation(
+                graders,
+                column_mapping,
+                input_data_df,
+                aoi_name
+            )
+            need_get_oai_results = len(eval_run_info_list) > 0
+        except EvaluationException as e:
+            if need_local_run:
+                # If there are normal evaluators, don't stop execution and try to run
+                # those.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                    " Continuing with local evaluators.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+    if need_local_run:
+        try:
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data,
+                fail_on_evaluator_errors=fail_on_evaluator_errors
+            )
+            results_df = eval_result_df
+            metrics = eval_metrics
+            got_local_results = True
+            # TODO figure out how to update this printing to include OAI results?
+            _print_summary(per_evaluator_results)
+        except EvaluationException as e:
+            if need_get_oai_results:
+                # If there are OAI graders, we only print a warning on local failures.
+                LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Retrieve OAI eval run results if needed.
+    if need_get_oai_results:
+        try:
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
+            # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here

+            # Combine results if both evaluators and graders are present
+            if len(evaluators) > 0:
+                results_df = pd.concat([results_df, aoai_results], axis=1)
+                metrics.update(aoai_metrics)
+            else:
+                # Otherwise combine aoai results with input data df to include input columns in outputs.
+                results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                metrics = aoai_metrics
+        except EvaluationException as e:
+            if got_local_results:
+                # If there are local eval results, we only print a warning on OAI failure.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Done with all evaluations, message outputs into final forms, and log results if needed.
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if is_onedp_project(azure_ai_project):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+        )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+            )
+
+    result_df_dict = results_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
+
+    if output_path:
+        _write_output(output_path, result)
+
+    return result
+
+
+def _preprocess_data(
+    data: Union[str, os.PathLike],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    target: Optional[Callable] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+    evaluation_name: Optional[str] = None,
+    **kwargs,
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -769,27 +970,46 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         }
     )

-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})

-
+    # Split normal evaluators and OAI graders
+    evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
+    target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-
-
-
+    if kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
         batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df

+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target,
+            target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -803,17 +1023,6 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
-    elif kwargs.pop("_use_run_submitter_client", False):
-        batch_run_client = RunSubmitterClient()
-        batch_run_data = input_data_df
-    elif kwargs.pop("_use_pf_client", True):
-        batch_run_client = ProxyClient(user_agent=USER_AGENT)
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        batch_run_data = os.path.abspath(data)
-    else:
-        batch_run_client = CodeClient()
-        batch_run_data = input_data_df

     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -829,6 +1038,29 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
            column_mapping["default"][col] = f"${{data.{col}}}"

+    return __ValidatedData(
+        evaluators=evaluators,
+        graders=graders,
+        input_data_df=input_data_df,
+        column_mapping=column_mapping,
+        target_run=target_run,
+        batch_run_client=batch_run_client,
+        batch_run_data=batch_run_data,
+    )
+
+
+def _run_callable_evaluators(
+    validated_data: __ValidatedData,
+    fail_on_evaluator_errors: bool = False,
+    **kwargs,
+) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+    # Extract needed values
+    batch_run_client = validated_data["batch_run_client"]
+    target_run = validated_data["target_run"]
+    batch_run_data = validated_data["batch_run_data"]
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
     with EvalRunContext(batch_run_client):
         runs = {
             evaluator_name: batch_run_client.run(
@@ -889,31 +1121,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
     # will be marked as outputs already so we do not need to rename them.
-    input_data_df = _rename_columns_conditionally(input_data_df)
-
-    result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-    metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-    metrics.update(evaluators_metric)
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run: Optional[Run] = None
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-    studio_url = None
-    if trace_destination:
-        studio_url = _log_metrics_and_instance_results(
-            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-        )

-
-
+    input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+    eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+    eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+    eval_metrics.update(evaluators_metric)

-
-
-    if output_path:
-        _write_output(output_path, result)
+    return eval_result_df, eval_metrics, per_evaluator_results

-
+def _map_names_to_builtins(
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
+    """
+    Construct a mapping from user-supplied evaluator names to which known, built-in
+    evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+    mapped to the "unknown" value.

+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param graders: The dictionary of graders.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+    """
+    from .._eval_mapping import EVAL_CLASS_MAP
+    name_map = {}
+
+    for name, evaluator in evaluators.items():
+        # Check if the evaluator is a known built-in evaluator
+        found_eval = False
+        for eval_class, eval_id in EVAL_CLASS_MAP.items():
+            if isinstance(evaluator, eval_class):
+                name_map[name] = eval_id
+                found_eval = True
+                break
+        if not found_eval:
+            # If not found, map to "unknown"
+            name_map[name] = "unknown"
+
+    for name, grader in graders.items():
+        name_map[name] = grader.id
+
+    return name_map

 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
@@ -929,4 +1180,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
```