azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (78)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +201 -16
  6. azure/ai/evaluation/_constants.py +12 -0
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
  15. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  16. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  37. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  38. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
  42. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
  44. azure/ai/evaluation/_exceptions.py +9 -7
  45. azure/ai/evaluation/_http_utils.py +203 -132
  46. azure/ai/evaluation/_model_configurations.py +37 -9
  47. azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
  48. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  49. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  50. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  51. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  52. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  53. azure/ai/evaluation/_version.py +1 -1
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  55. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  56. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  57. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  58. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  59. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  60. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  61. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  62. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  63. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  64. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  65. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
  66. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  67. azure/ai/evaluation/simulator/_simulator.py +127 -117
  68. azure/ai/evaluation/simulator/_tracing.py +4 -4
  69. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
  70. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  71. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
  72. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  73. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  74. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  75. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  76. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  77. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  78. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -4,35 +4,48 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage
 
+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
 
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
 
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str, Type]
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
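
The constrained TypeVar and the private TypedDict introduced above are what let the batch-run code further down treat ProxyClient and CodeClient uniformly while keeping the per-evaluator result bundle typed. A minimal sketch of the same pattern, using placeholder client classes rather than the package's own:

    from typing import Any, Dict, TypedDict, TypeVar

    class FakeProxyClient:  # placeholder for ProxyClient
        def run(self, name: str) -> Dict[str, Any]:
            return {"metrics": {f"{name}.score": 1.0}}

    class FakeCodeClient:  # placeholder for CodeClient
        def run(self, name: str) -> Dict[str, Any]:
            return {"metrics": {f"{name}.score": 0.5}}

    # The TypeVar is constrained: TClient may only ever be one of the two listed clients.
    TClient = TypeVar("TClient", FakeProxyClient, FakeCodeClient)

    class EvaluatorInfo(TypedDict):  # mirrors the shape of the new __EvaluatorInfo
        metrics: Dict[str, Any]

    def run_one(client: TClient, name: str) -> EvaluatorInfo:
        return {"metrics": client.run(name)["metrics"]}

    print(run_one(FakeProxyClient(), "coherence"))  # {'metrics': {'coherence.score': 1.0}}
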
@@ -73,7 +86,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
 
 
-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
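
The defect-rate arithmetic itself is unchanged by the switch from np.sum to list_sum: a defect rate is the fraction of usable scores at or above the threshold, rounded to two decimals. A rough standalone illustration with invented scores (the threshold value and the exact behaviour of list_sum are not shown in this diff, so the built-in sum stands in for it):

    import pandas as pd

    CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4  # value assumed for illustration

    scores = pd.Series([1, 5, 7, None, 2])            # one missing value
    numeric = pd.to_numeric(scores, errors="coerce")  # non-numeric entries become NaN

    # count() ignores NaN, so the denominator is the number of usable scores (4 here).
    defect_rate = round(
        sum(numeric >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) / numeric.count(), 2
    )
    print(defect_rate)  # 0.5 -> two of the four valid scores meet the threshold
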
@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str, Type]
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -158,6 +171,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
@@ -271,9 +290,9 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 
 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str, Any],
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -281,11 +300,11 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Any]
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +325,7 @@
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -320,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
@@ -342,15 +361,15 @@
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
     # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
         flow=target,
         display_name=evaluation_name,
         data=data,
-        properties={"runType": "eval_run", "isEvaluatorRun": "true"},
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
         stream=True,
         name=_run_name,
     )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -372,28 +391,30 @@
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
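
The negative lookahead in unexpected_references is the whole of the reference validation: any ${...} placeholder not rooted at target. or data. is rejected. A quick standalone check of the same pattern:

    import re

    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

    for value in ("${data.query}", "${target.response}", "${run.outputs.answer}", "${foo}"):
        verdict = "rejected" if unexpected_references.search(value) else "allowed"
        print(f"{value}: {verdict}")
    # ${data.query}: allowed
    # ${target.response}: allowed
    # ${run.outputs.answer}: rejected
    # ${foo}: rejected
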
@@ -439,7 +460,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +479,10 @@
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +503,7 @@
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +518,19 @@
             },
             evaluator_config={
                 "coherence": {
-                    "response": "${data.response}",
-                    "query": "${data.query}"
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "response": "${data.response}",
-                    "context": "${data.context}",
-                    "query": "${data.query}"
-                }
-            }
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )
 
     """
@@ -544,41 +569,74 @@ def evaluate(
         raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    data: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
    azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
-    pf_client = PFClient(
-        config=(
-            {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
-        ),
-        user_agent=USER_AGENT,
-    )
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
 
-    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-    target_run = None
-    target_generated_columns = set()
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-    evaluator_config = evaluator_config or {}
-    evaluator_config.setdefault("default", {})
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
@@ -586,21 +644,21 @@ def _evaluate( # pylint: disable=too-many-locals
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
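
The loop above only fills gaps: every column produced by the target gains a ${run.outputs.<column>} mapping unless the user already mapped that column or already points something at the run output. A small illustration of that bookkeeping with made-up column names:

    column_mapping = {"default": {"ground_truth": "${data.truth}"}}
    target_generated_columns = {"response", "ground_truth"}

    for evaluator_name, mapping in column_mapping.items():
        mapped_to_values = set(mapping.values())
        for col in target_generated_columns:
            run_output = f"${{run.outputs.{col}}}"
            # Only add the mapping when the user has not handled this column already.
            if col not in mapping and run_output not in mapped_to_values:
                column_mapping[evaluator_name][col] = run_output

    print(column_mapping["default"])
    # {'ground_truth': '${data.truth}', 'response': '${run.outputs.response}'}
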
@@ -610,44 +668,56 @@ def _evaluate( # pylint: disable=too-many-locals
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-            evaluator_config["default"][col] = f"${{data.{col}}}"
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-    with BatchRunContext(batch_run_client):
-        for evaluator_name, evaluator in evaluators.items():
-            evaluators_info[evaluator_name] = {}
-            evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                flow=evaluator,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-                data=data,
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
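
Because both branches now go through eval_batch_run, downstream code always sees the same shape: one entry per evaluator with result, metrics and run_summary keys, which is what _print_summary later filters and prints. A toy illustration with invented values (the result DataFrame is omitted):

    import json

    per_evaluator_results = {
        "coherence": {
            "metrics": {"coherence.gpt_coherence": 4.2},
            "run_summary": {"completed_lines": 10, "failed_lines": 0},
        },
        "custom_eval": {
            "metrics": {"custom_eval.score": 0.9},
            "run_summary": {},  # empty summaries are filtered out below
        },
    }

    summaries = {
        name: result["run_summary"]
        for name, result in per_evaluator_results.items()
        if result.get("run_summary")
    }
    print(json.dumps(summaries, indent=4))  # only "coherence" is printed
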
@@ -670,7 +740,7 @@ def _evaluate( # pylint: disable=too-many-locals
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -688,9 +758,11 @@
         evaluation_name,
     )
 
-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
 
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict, TypeVar
+from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,31 +16,30 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec
 
+from azure.ai.evaluation._model_configurations import AzureAIProject
+
 from ..._user_agent import USER_AGENT
-from .._utils import _trace_destination_from_project_scope
+from .._utils import EvaluateResult, _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
-R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype: str
+    :rtype: Literal["content-safety", "built-in", "custom"]
     """
-    built_in = False
-    content_safety = False
-
     module = inspect.getmodule(evaluator)
-    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
-    if built_in:
-        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
@@ -98,22 +97,22 @@
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
+def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, R]
+    :rtype: Callable[P, EvaluateResult]
     """
 
     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators = kwargs.get("evaluators", [])
-        azure_ai_project = kwargs.get("azure_ai_project", None)
+        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
 
         pf_client = PFClient(
             config=(
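
The typing change narrows the decorator from a generic Callable[P, R] to Callable[P, EvaluateResult]: the wrapper keeps the caller's parameter signature but pins the return type. A minimal sketch of that pattern with a stand-in result type (typing_extensions.ParamSpec, as used in the diff, is assumed to be available):

    import functools
    from typing import Callable, TypedDict
    from typing_extensions import ParamSpec

    P = ParamSpec("P")

    class FakeEvaluateResult(TypedDict):  # stand-in for EvaluateResult
        metrics: dict

    def log_activity(func: Callable[P, FakeEvaluateResult]) -> Callable[P, FakeEvaluateResult]:
        @functools.wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> FakeEvaluateResult:
            print(f"calling {func.__name__}")  # a telemetry hook would go here
            return func(*args, **kwargs)
        return wrapper

    @log_activity
    def fake_evaluate(*, data: str) -> FakeEvaluateResult:
        return {"metrics": {"rows": 1, "source": data}}

    print(fake_evaluate(data="results.jsonl")["metrics"])
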
@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
         track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
+        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,