azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (64):
  1. azure/ai/evaluation/__init__.py +1 -1
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +182 -12
  6. azure/ai/evaluation/_constants.py +10 -2
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
  24. azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
  31. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
  32. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  33. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
  34. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
  35. azure/ai/evaluation/_exceptions.py +9 -6
  36. azure/ai/evaluation/_http_utils.py +203 -132
  37. azure/ai/evaluation/_model_configurations.py +5 -5
  38. azure/ai/evaluation/_vendor/__init__.py +3 -0
  39. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  40. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  41. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  42. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  43. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  46. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  47. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  48. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  49. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  50. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  53. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  54. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  55. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  56. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
  57. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  58. azure/ai/evaluation/simulator/_simulator.py +112 -113
  59. azure/ai/evaluation/simulator/_tracing.py +4 -4
  60. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
  61. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  62. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
  63. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)
 
 class CodeRun:
     def __init__(
-        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
-    ):
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics = aggregated_metrics
+        self.aggregated_metrics = aggregator(self)
 
-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df
 
-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )
 
-    def _calculate_aggregations(self, evaluator, run):
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-        run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
-        aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
-        run.aggregated_metrics = aggregation_future
-        return run
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )
 
     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run: CodeRun) -> Optional[None]:
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return None
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
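
The change above replaces the mutate-after-construct pattern (create the CodeRun, then assign run.aggregated_metrics from the outside) with an aggregator callback that the constructor invokes itself, so the run is never observable in a half-initialized state. A minimal, self-contained sketch of that pattern using only the standard library (the names below are illustrative, not the SDK's):

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable, Optional


class MiniRun:
    """Stand-in for CodeRun: the aggregation future is created by the constructor."""

    def __init__(self, *, run: Future, aggregator: Callable[["MiniRun"], Future]) -> None:
        self.run = run
        # The aggregation future exists as soon as the object does.
        self.aggregated_metrics: Optional[Future] = aggregator(self)


pool = ThreadPoolExecutor()
eval_future = pool.submit(lambda: [1, 2, 3])  # pretend per-row results
run = MiniRun(
    run=eval_future,
    aggregator=lambda r: pool.submit(lambda: sum(r.run.result()) / len(r.run.result())),
)
print(run.aggregated_metrics.result())  # 2.0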
--- a/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
@@ -3,11 +3,12 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,13 +54,27 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)", np.nan, inplace=True)
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
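
The new ProxyClient.get_run_summary builds an ordered, human-readable snapshot from promptflow's run internals. A rough illustration of the resulting shape, using a stand-in object in place of a real promptflow Run (the attribute values below are made up):

from collections import OrderedDict
from datetime import datetime


class FakeRun:
    """Stand-in for promptflow.entities.Run; values are made up for illustration."""

    status = "Completed"
    _created_on = datetime(2024, 10, 1, 12, 0, 0)
    _end_time = datetime(2024, 10, 1, 12, 0, 42)
    _properties = {"system_metrics": {"__pf__.lines.completed": 10, "__pf__.lines.failed": 0}}
    _output_path = "/tmp/evals/my_run"


run = FakeRun()
summary = OrderedDict(
    [
        ("status", run.status),
        ("duration", str(run._end_time - run._created_on)),  # '0:00:42'
        ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
        ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
        ("log_path", str(run._output_path)),
    ]
)
print(summary)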
--- a/azure/ai/evaluation/_evaluate/_eval_run.py
+++ b/azure/ai/evaluation/_evaluate/_eval_run.py
@@ -10,10 +10,11 @@ import posixpath
 import time
 import types
 import uuid
-from typing import Any, Dict, Optional, Set, Type
+from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
 from promptflow._sdk.entities import Run
+from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
@@ -27,6 +28,7 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
+    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
@@ -121,8 +123,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base = None
-        self.info = None
+        self._url_base: Optional[str] = None
+        self._info: Optional[RunInfo] = None
 
     @property
     def status(self) -> RunStatus:
@@ -134,6 +136,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return self._status
 
+    @property
+    def info(self) -> RunInfo:
+        if self._info is None:
+            msg = "Run info is missing"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVAL_RUN,
+                category=ErrorCategory.UNKNOWN,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        return self._info
+
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
@@ -161,11 +177,11 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.info = RunInfo.generate(self._run_name)
+            self._info = RunInfo.generate(self._run_name)
         else:
            self._url_base = urlparse(self._tracking_uri).netloc
            if self._promptflow_run is not None:
-                self.info = RunInfo(
+                self._info = RunInfo(
                     self._promptflow_run.name,
                     self._promptflow_run._experiment_name,  # pylint: disable=protected-access
                     self._promptflow_run.name,
@@ -182,7 +198,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                 body["run_name"] = self._run_name
                 response = self.request_with_retry(url=url, method="POST", json_dict=body)
                 if response.status_code != 200:
-                    self.info = RunInfo.generate(self._run_name)
+                    self._info = RunInfo.generate(self._run_name)
                     LOGGER.warning(
                         "The run failed to start: %s: %s."
                         "The results will be saved locally, but will not be logged to Azure.",
@@ -192,7 +208,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                     self._status = RunStatus.BROKEN
                 else:
                     parsed_response = response.json()
-                    self.info = RunInfo(
+                    self._info = RunInfo(
                         run_id=parsed_response["run"]["info"]["run_id"],
                         experiment_id=parsed_response["run"]["info"]["experiment_id"],
                         run_name=parsed_response["run"]["info"]["run_name"],
@@ -235,7 +251,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         """The Context Manager enter call.
 
         :return: The instance of the class.
@@ -249,7 +265,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
-    ) -> Optional[bool]:
+    ) -> None:
         """The context manager exit call.
 
         :param exc_type: The exception type
@@ -408,7 +424,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths = {"paths": []}
+        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):
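
The EvalRun change converts a plain attribute that could silently be None into a guarded property over an Optional backing field, so misuse fails loudly. A simplified sketch of the same pattern outside the SDK (class names and the exception type here are illustrative):

from dataclasses import dataclass
from typing import Optional


@dataclass
class RunInfo:
    run_id: str
    experiment_id: str
    run_name: str


class TrackedRun:
    """Illustrative stand-in for EvalRun's backing-field-plus-property arrangement."""

    def __init__(self) -> None:
        self._info: Optional[RunInfo] = None

    @property
    def info(self) -> RunInfo:
        # Mirrors EvalRun.info: raise instead of handing callers a None.
        if self._info is None:
            raise RuntimeError("Run info is missing")
        return self._info

    def start(self) -> None:
        self._info = RunInfo(run_id="0", experiment_id="0", run_name="local-run")


run = TrackedRun()
run.start()
print(run.info.run_name)  # reading .info before start() would raise instead of returning None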
--- a/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -4,18 +4,22 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage
 
+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -23,16 +27,25 @@ from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
 
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
 
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str, Type]
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
@@ -73,7 +86,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
 
 
-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str, Type]
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -277,7 +290,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 
 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str, Any],
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
     column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
@@ -287,7 +300,7 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Any]
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
     :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
@@ -326,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
@@ -348,15 +361,15 @@ def _apply_target_to_data(
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
     # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
         flow=target,
         display_name=evaluation_name,
         data=data,
-        properties={"runType": "eval_run", "isEvaluatorRun": "true"},
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
         stream=True,
         name=_run_name,
     )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -378,16 +391,18 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
     """Process column_mapping to replace ${target.} with ${data.}
 
     :param column_mapping: The configuration for evaluators.
-    :type column_mapping: Dict[str, Dict[str, str]]
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
@@ -554,41 +569,69 @@ def evaluate(
         raise e
 
 
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
+    data: str,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-    column_mapping = {
-        evaluator_name: evaluator_configuration.get("column_mapping", None)
-        for evaluator_name, evaluator_configuration in evaluator_config.items()
-    }
-    column_mapping = _process_column_mappings(column_mapping)
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
     _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
-    pf_client = PFClient(
-        config=(
-            {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
-        ),
-        user_agent=USER_AGENT,
-    )
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
+
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-    target_run = None
-    target_generated_columns = set()
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
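
The try/except above turns promptflow's MissingAzurePackage into an actionable message pointing at the optional "remote" extra. A stripped-down sketch of that error-translation pattern (the exception classes below are local stand-ins, not the real promptflow or SDK types):

class MissingAzurePackage(ImportError):
    """Local stand-in for promptflow._sdk._errors.MissingAzurePackage."""


class EvaluationError(Exception):
    """Local stand-in for the SDK's EvaluationException."""


def build_client(remote_tracking: bool) -> str:
    try:
        if remote_tracking:
            # The real code constructs PFClient with a trace destination here, which
            # raises MissingAzurePackage when promptflow-azure is not installed.
            raise MissingAzurePackage("promptflow-azure is not installed")
        return "local-only client"
    except MissingAzurePackage:
        # Re-raise as a user-facing error with a concrete fix instead of leaking
        # an internal import failure.
        raise EvaluationError(
            "The required packages for remote tracking are missing.\n"
            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
        ) from None


print(build_client(remote_tracking=False))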
@@ -627,45 +670,54 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             # Also ignore columns that are already in config, since they've been covered by target mapping.
             if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
                 column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-    with BatchRunContext(batch_run_client):
-        for evaluator_name, evaluator in evaluators.items():
-            evaluators_info[evaluator_name] = {}
-            evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                flow=evaluator,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                data=data,
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -688,7 +740,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -706,9 +758,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )
 
-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
 
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
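
With the refactor above, each evaluator's batch run now yields a small record of "result", "metrics", and "run_summary", and _print_summary prints the non-empty summaries at the end of the run. An illustrative sketch of that shape and of the summary filter, using made-up data (not the SDK's code path):

import json
import pandas as pd

per_evaluator_results = {
    "relevance": {
        "result": pd.DataFrame({"outputs.relevance.score": [4, 5]}),
        "metrics": {"relevance.score": 4.5},
        "run_summary": {"status": "Completed", "completed_lines": 2, "failed_lines": 0},
    },
    "fluency": {
        "result": pd.DataFrame({"outputs.fluency.score": [5, 5]}),
        "metrics": {"fluency.score": 5.0},
        "run_summary": None,  # CodeClient has no summary; falsy values are skipped by the filter
    },
}

# Mirrors _print_summary: only evaluators with a non-empty run_summary are shown.
output_dict = {
    name: result["run_summary"]
    for name, result in per_evaluator_results.items()
    if result.get("run_summary")
}
if output_dict:
    print("======= Combined Run Summary (Per Evaluator) =======\n")
    print(json.dumps(output_dict, indent=4))
    print("\n====================================================")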