azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -4
- azure/ai/evaluation/_common/rai_service.py +4 -4
- azure/ai/evaluation/_common/utils.py +40 -25
- azure/ai/evaluation/_constants.py +13 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- azure/ai/evaluation/_evaluate/_evaluate.py +88 -63
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -86
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -79
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -85
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -88
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +17 -29
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -18
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -91
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_model_configurations.py +36 -8
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +166 -88
- azure/ai/evaluation/simulator/_tracing.py +21 -24
- azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +144 -14
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +98 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -350
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -66
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +0 -97
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py:

@@ -8,27 +8,26 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
-
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
     Prefixes,
     _InternalEvaluationMetrics,
 )
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
-from ._telemetry import log_evaluate_activity
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 # pylint: disable=line-too-long

@@ -159,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
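The block added to `_validate_input_data_for_evaluator` relaxes validation for conversation-style evaluators: when `conversation` is one of the required inputs and every non-conversation input is absent, the missing columns are ignored. A minimal standalone sketch of that check (the function name and sample inputs are illustrative, not part of the SDK):

```python
from typing import List


def clear_missing_if_conversation(required_inputs: List[str], missing_inputs: List[str]) -> List[str]:
    """Mirror the added check: conversation evaluators may omit per-turn columns."""
    if missing_inputs and "conversation" in required_inputs:
        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
        # As written in the hunk, the list comprehension is truthy whenever missing_inputs
        # is non-empty, so the length comparison is the effective guard.
        if len(missing_inputs) == len(non_conversation_inputs) and [
            val in non_conversation_inputs for val in missing_inputs
        ]:
            return []
    return missing_inputs


# A conversation evaluator that also declares "query" and "response" as inputs:
print(clear_missing_if_conversation(["conversation", "query", "response"], ["query", "response"]))  # []
print(clear_missing_if_conversation(["query", "response"], ["query"]))  # ['query'] still reported
```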
@@ -260,12 +265,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-
-
-
-
-
-
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
 
     return initial_data_df
 

@@ -274,7 +279,7 @@ def _validate_columns(
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.

@@ -285,8 +290,8 @@
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param
-    :type
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -307,7 +312,7 @@
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config =
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator

@@ -373,11 +378,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param
-    :type
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """

@@ -386,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in '
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
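The renamed `_process_column_mappings` keeps the same guard: only `${target.…}` and `${data.…}` placeholders are accepted in a column mapping. A quick sketch of how the regex from the hunk above classifies mapping values (the sample mapping is illustrative):

```python
import re

# Regex copied from the hunk above: matches any ${...} reference that is not ${target.} or ${data.}.
unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

sample_mapping = {
    "response": "${target.response}",     # accepted: produced by the target
    "query": "${data.query}",             # accepted: read from the input data
    "context": "${run.outputs.context}",  # rejected: anything else trips the check
}

for key, value in sample_mapping.items():
    status = "rejected" if unexpected_references.search(value) else "accepted"
    print(f"{key}: {value} -> {status}")
```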
@@ -436,11 +441,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
+    data: str,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,

@@ -448,21 +453,21 @@ def evaluate(
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
-    :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None
-    :paramtype data: Optional[str]
-    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-    :paramtype evaluators: Optional[Dict[str, Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a
-        keys as the column names in the evaluator input and values as the column names in the
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str,
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]

@@ -483,7 +488,7 @@ def evaluate(
         model_config = {
            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-           "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+           "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
        }
 
        coherence_eval = CoherenceEvaluator(model_config=model_config)

@@ -498,15 +503,19 @@ def evaluate(
            },
            evaluator_config={
                "coherence": {
-                   "
-
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "query": "${data.query}",
+                   },
                },
                "relevance": {
-                   "
-
-
-
-
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "context": "${data.context}",
+                       "query": "${data.query}",
+                   },
+               },
+           },
        )
 
     """
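Pulling the updated docstring example together, a call against the 1.0.0b3 signature looks roughly like this; the model configuration and mappings come from the example above, while the file name and the `RelevanceEvaluator` import are assumptions for illustration:

```python
import os

from azure.ai.evaluation import CoherenceEvaluator, RelevanceEvaluator, evaluate

# Model configuration as in the docstring example above.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

result = evaluate(
    data="evaluate_test_data.jsonl",  # hypothetical .jsonl file with query/context/response columns
    evaluators={
        "coherence": CoherenceEvaluator(model_config=model_config),
        "relevance": RelevanceEvaluator(model_config=model_config),
    },
    # evaluator_config now nests each mapping under a "column_mapping" key.
    evaluator_config={
        "coherence": {
            "column_mapping": {
                "response": "${data.response}",
                "query": "${data.query}",
            },
        },
        "relevance": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            },
        },
    },
)
```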
@@ -545,13 +554,13 @@ def evaluate(
         raise e
 
 
-def _evaluate( # pylint: disable=too-many-locals
+def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,

@@ -561,8 +570,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-
-
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(

@@ -572,43 +586,55 @@ def _evaluate( # pylint: disable=too-many-locals
         user_agent=USER_AGENT,
     )
 
-    trace_destination = pf_client._config.get_trace_destination()
-
+    trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-
+                    column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
 
     # After we have generated all columns we can check if we have
     # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None,
-
+    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+                column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
     # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
     # multiple evaluators. If the path is already absolute, abspath will return the original path.
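Net effect of the remapped logic above: target outputs are wired to `${run.outputs.<col>}` unless the user mapped them explicitly, and every remaining input column falls back to a 1-1 `${data.<col>}` entry under the `"default"` key. A toy replay of those two rules (column names are hypothetical, and the per-evaluator handling is collapsed into the default mapping for brevity):

```python
# Input data has "query" and "ground_truth"; the target produced "response",
# which also shows up in the merged frame as "__outputs.response".
target_generated_columns = {"response"}
input_columns = ["query", "ground_truth", "__outputs.response"]

column_mapping = {"default": {}}

# Rule 1: map target outputs to ${run.outputs.<col>} unless already mapped.
for col in target_generated_columns:
    column_mapping["default"].setdefault(col, f"${{run.outputs.{col}}}")

# Rule 2: 1-1 ${data.<col>} mapping for the rest, skipping "__outputs." columns.
for col in input_columns:
    if not col.startswith("__outputs.") and col not in column_mapping["default"]:
        column_mapping["default"][col] = f"${{data.{col}}}"

print(column_mapping)
# {'default': {'response': '${run.outputs.response}',
#              'query': '${data.query}',
#              'ground_truth': '${data.ground_truth}'}}
```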
@@ -624,7 +650,7 @@ def _evaluate( # pylint: disable=too-many-locals
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),

@@ -672,7 +698,6 @@ def _evaluate( # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
azure/ai/evaluation/_evaluate/_telemetry/__init__.py:

@@ -6,21 +6,24 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict, TypeVar
 
 import pandas as pd
-
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
 from promptflow._sdk.entities._flows import Prompty as prompty_sdk
 from promptflow._sdk.entities._flows.dag import Flow as dag_flow
 from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
+from typing_extensions import ParamSpec
 
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
+P = ParamSpec("P")
+R = TypeVar("R")
+
 
 def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """

@@ -82,7 +85,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         name = str(evaluator)
         pf_type = "Unknown"
     except Exception as e: # pylint: disable=broad-exception-caught
-        LOGGER.debug(
+        LOGGER.debug("Failed to get evaluator properties: %s", e)
         name = str(evaluator)
         pf_type = "Unknown"
 

@@ -95,15 +98,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func) -> None:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
     :type func: Callable
+    :returns: The decorated function
+    :rtype: Callable[P, R]
     """
 
     @functools.wraps(func)
-    def wrapper(*args, **kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
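The new typing follows the standard `ParamSpec` pattern for decorators that must not erase the wrapped function's signature. A self-contained sketch of the same pattern (the logging body is a stand-in, not the SDK's telemetry code; `typing_extensions` is assumed to be installed, as the hunk itself imports it):

```python
import functools
import logging
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")

LOGGER = logging.getLogger(__name__)


def log_activity(func: Callable[P, R]) -> Callable[P, R]:
    """Log entry and exit around ``func`` while preserving its exact signature for type checkers."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        LOGGER.debug("calling %s", func.__name__)
        result = func(*args, **kwargs)
        LOGGER.debug("finished %s", func.__name__)
        return result

    return wrapper


@log_activity
def add(a: int, b: int) -> int:
    return a + b


print(add(1, 2))  # 3; type checkers still see add as (int, int) -> int
```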
@@ -119,7 +124,7 @@ def log_evaluate_activity(func) -> None:
             user_agent=USER_AGENT,
         )
 
-        track_in_cloud = bool(pf_client._config.get_trace_destination())
+        track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
         custom_dimensions = {

@@ -154,7 +159,7 @@ def log_evaluate_activity(func) -> None:
                 evaluator_info["failed_rows"] = failed_rows
                 evaluator_info["total_rows"] = total_rows
             except Exception as e: # pylint: disable=broad-exception-caught
-                LOGGER.debug(
+                LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
             evaluators_info.append(evaluator_info)
 
         custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}

@@ -167,7 +172,7 @@ def log_evaluate_activity(func) -> None:
             ):
                 pass
         except Exception as e: # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Failed to collect evaluate usage info: %s", e)
 
         return result
 
azure/ai/evaluation/_evaluate/_utils.py:

@@ -8,12 +8,13 @@ import re
 import tempfile
 from collections import namedtuple
 from pathlib import Path
+from typing import Dict
 
 import pandas as pd
 
-from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 LOGGER = logging.getLogger(__name__)
 

@@ -33,16 +34,16 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
-
-
-
-
-
-
-
-
-
-
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)

@@ -50,7 +51,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
 
 
 def load_jsonl(path):
-    with open(path, "r", encoding=
+    with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
         return [json.loads(line) for line in f.readlines()]
 
 
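`load_jsonl` and the writers below now take their encodings from `DefaultOpenEncoding` constants rather than hard-coded strings. The constants' values are not visible in this diff; the sketch below assumes conventional UTF-8 choices just to show the round trip, and the file name is illustrative:

```python
import json
from typing import Any, Dict, List

# Assumed values; the real constants live in azure.ai.evaluation._constants.DefaultOpenEncoding.
READ_ENCODING = "utf-8-sig"  # tolerant of a BOM when reading
WRITE_ENCODING = "utf-8"


def load_jsonl(path: str) -> List[Dict[str, Any]]:
    # Same shape as the helper in the hunk above: one JSON object per line.
    with open(path, "r", encoding=READ_ENCODING) as f:
        return [json.loads(line) for line in f.readlines()]


def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> None:
    with open(path, "w", encoding=WRITE_ENCODING) as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")


write_jsonl("sample.jsonl", [{"query": "hi", "response": "hello"}])
print(load_jsonl("sample.jsonl"))  # [{'query': 'hi', 'response': 'hello'}]
```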
@@ -99,7 +100,7 @@ def _log_metrics_and_instance_results(
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = os.path.join(tmpdir, artifact_name)
 
-        with open(tmp_path, "w", encoding=
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
             f.write(instance_results.to_json(orient="records", lines=True))
 
         ev_run.log_artifact(tmpdir, artifact_name)

@@ -155,11 +156,13 @@ def _write_output(path, data_dict):
     if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
-    with open(p, "w") as f:
+    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
 
-def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
+def _apply_column_mapping(
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
 

@@ -167,10 +170,11 @@ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
     :param source_df: the data frame to be changed.
     :type source_df: pd.DataFrame
     :param mapping_config: The configuration, containing column mapping.
-    :type mapping_config:
+    :type mapping_config: Dict[str, str].
     :param inplace: If true, the source_df will be changed inplace.
     :type inplace: bool
     :return: The modified data frame.
+    :rtype: pd.DataFrame
     """
     result_df = source_df
 
@@ -211,19 +215,22 @@ def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
-def get_int_env_var(env_var_name, default_value
+def get_int_env_var(env_var_name: str, default_value: int) -> int:
     """
-    The function `get_int_env_var` retrieves an integer environment variable value, with
+    The function `get_int_env_var` retrieves an integer environment variable value, with a
     default value if the variable is not set or cannot be converted to an integer.
 
     :param env_var_name: The name of the environment variable you want to retrieve the value of
+    :type env_var_name: str
     :param default_value: The default value is the value that will be returned if the environment
-
+        variable is not found or if it cannot be converted to an integer
+    :type default_value: int
     :return: an integer value.
+    :rtype: int
     """
     try:
-        return int(os.environ
-    except
+        return int(os.environ[env_var_name])
+    except (ValueError, KeyError):
         return default_value
 
 
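The reworked helper now requires an explicit integer default and narrows its error handling to `KeyError`/`ValueError`. A standalone copy of the post-change behaviour with a short usage example (the environment variable names are made up):

```python
import os


def get_int_env_var(env_var_name: str, default_value: int) -> int:
    """Return the environment variable as an int, or default_value if it is unset or not an integer."""
    try:
        return int(os.environ[env_var_name])
    except (ValueError, KeyError):
        return default_value


os.environ["EVAL_BATCH_SIZE"] = "25"
print(get_int_env_var("EVAL_BATCH_SIZE", 10))    # 25
print(get_int_env_var("EVAL_MISSING_VAR", 10))   # 10 (unset)
os.environ["EVAL_BATCH_SIZE"] = "not-a-number"
print(get_int_env_var("EVAL_BATCH_SIZE", 10))    # 10 (not an integer)
```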
azure/ai/evaluation/_evaluators/_bleu/_bleu.py:

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 