azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -4,35 +4,48 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage

+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )

+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+

 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.

@@ -73,7 +86,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )

@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates


-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.

@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """

@@ -158,6 +171,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]

     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."

@@ -271,9 +290,9 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str,
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
-
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.

@@ -281,11 +300,11 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param
-    :type
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:

@@ -306,7 +325,7 @@ def _validate_columns(
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config =
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)

             # Validate input data for evaluator

@@ -320,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -342,15 +361,15 @@ def _apply_target_to_data(
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
     # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
         flow=target,
         display_name=evaluation_name,
         data=data,
-        properties={
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
         stream=True,
         name=_run_name,
     )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)

@@ -372,28 +391,30 @@ def _apply_target_to_data(
     return target_output, generated_columns, run


-def
-
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}

-    :param
-    :type
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """

-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}

     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

-    if
-    for evaluator, mapping_config in
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}

                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in '
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,

@@ -439,7 +460,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,

@@ -458,10 +479,10 @@ def evaluate(
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a
-        keys as the column names in the evaluator input and values as the column names in the
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str,
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]

@@ -482,7 +503,7 @@ def evaluate(
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

         coherence_eval = CoherenceEvaluator(model_config=model_config)

@@ -497,15 +518,19 @@ def evaluate(
             },
             evaluator_config={
                 "coherence": {
-                    "
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "
-
-
-
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )

     """
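The docstring hunk above shows the 1.0.0b4 shape of `evaluator_config`: each per-evaluator entry nests its mappings under a `column_mapping` key, typed as `EvaluatorConfig`, while the removed annotation later in this diff shows that 1.0.0b2 accepted a flat `Dict[str, Dict[str, str]]`. A minimal migration sketch under that assumption (evaluator names and column names here are illustrative, not taken from the package); the `_evaluate.py` diff continues below.

# 1.0.0b2-style configuration (inferred from the removed
# `Optional[Dict[str, Dict[str, str]]]` annotation): a flat mapping per evaluator.
old_evaluator_config = {
    "coherence": {
        "response": "${data.response}",
        "query": "${data.query}",
    },
}

# Equivalent 1.0.0b4 configuration: the same mapping nested under "column_mapping",
# matching the EvaluatorConfig shape referenced in this diff.
new_evaluator_config = {
    "coherence": {
        "column_mapping": {
            "response": "${data.response}",
            "query": "${data.query}",
        },
    },
}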
@@ -544,41 +569,74 @@ def evaluate(
         raise e


-def
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
+def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    data: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-
-
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+    _validate_columns(input_data_df, evaluators, target, column_mapping)

     # Target Run
-
-
-
-
-
-
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )

-
-
-
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination() # pylint: disable=protected-access
+    target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-
-
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
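The hunk above wraps the `PFClient` construction in a `try`/`except MissingAzurePackage` and directs users to the `[remote]` extra when remote-tracking dependencies are absent. A hedged sketch of the call path that exercises this, assuming the public `evaluate` and `F1ScoreEvaluator` entry points and an `AzureAIProject` dict carrying subscription, resource group, and project name fields (all values are placeholders, not from the diff); the `_evaluate.py` diff continues below.

# Assumption: remote tracking requires `pip install azure-ai-evaluation[remote]`,
# per the error message added in the hunk above.
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder values
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

result = evaluate(
    data="data.jsonl",                           # path to a JSONL dataset
    evaluators={"f1_score": F1ScoreEvaluator()},
    azure_ai_project=azure_ai_project,           # enables the trace destination / studio URL
)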
@@ -586,21 +644,21 @@ def _evaluate( # pylint: disable=too-many-locals
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )

-        for evaluator_name, mapping in
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-
+                    column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup

         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None,
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.

@@ -610,44 +668,56 @@ def _evaluate( # pylint: disable=too-many-locals
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in
-
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-        with BatchRunContext(batch_run_client):
-            for evaluator_name, evaluator in evaluators.items():
-                evaluators_info[evaluator_name] = {}
-                evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-
-            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-            for evaluator_name, evaluator_info in evaluators_info.items():
-                evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-                evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)

     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-        evaluator_result_df =
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]

         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(

@@ -670,7 +740,7 @@ def _evaluate( # pylint: disable=too-many-locals
             else evaluator_result_df
         )

-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns

@@ -688,9 +758,11 @@ def _evaluate( # pylint: disable=too-many-locals
         evaluation_name,
     )

-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}

    if output_path:
         _write_output(output_path, result)

+    _print_summary(per_evaluator_results)
+
     return result
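The `_evaluate.py` changes above type the return value as `EvaluateResult`, a dict with `rows`, `metrics`, and `studio_url` keys, and add `_print_summary` for per-evaluator run summaries. A small caller-side sketch of consuming that shape (`summarize_evaluate_result` is a hypothetical helper, not part of the package); the diff continues below with the telemetry module.

import json
from typing import Any, Dict

import pandas as pd


def summarize_evaluate_result(result: Dict[str, Any]) -> pd.DataFrame:
    # Keys follow the EvaluateResult shape shown in this diff: "rows", "metrics", "studio_url".
    print(json.dumps(result["metrics"], indent=4))   # aggregated metrics, e.g. "<evaluator>.<metric>"
    if result.get("studio_url"):                     # populated when azure_ai_project is provided
        print("Studio URL:", result["studio_url"])
    return pd.DataFrame(result["rows"])              # per-row evaluator outputs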
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, Literal, Optional, Union, cast

 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow

@@ -16,31 +16,30 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

+from azure.ai.evaluation._model_configurations import AzureAIProject
+
 from ..._user_agent import USER_AGENT
-from .._utils import _trace_destination_from_project_scope
+from .._utils import EvaluateResult, _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

 P = ParamSpec("P")
-R = TypeVar("R")


-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
     """
     Get evaluator type for telemetry.

     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: Literal["content-safety", "built-in", "custom"]
     """
-    built_in = False
-    content_safety = False
-
     module = inspect.getmodule(evaluator)
-
-
-
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

     if content_safety:
         return "content-safety"

@@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluateResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

-        evaluators = kwargs.get("evaluators",
-        azure_ai_project = kwargs.get("azure_ai_project", None)
+        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))

         pf_client = PFClient(
             config=(

@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
         track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
+        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
            "evaluate_target": evaluate_target,
            "evaluator_config": evaluator_config,