azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (78)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +201 -16
  6. azure/ai/evaluation/_constants.py +12 -0
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
  15. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  16. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  37. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  38. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
  42. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
  44. azure/ai/evaluation/_exceptions.py +9 -7
  45. azure/ai/evaluation/_http_utils.py +203 -132
  46. azure/ai/evaluation/_model_configurations.py +37 -9
  47. azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
  48. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  49. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  50. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  51. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  52. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  53. azure/ai/evaluation/_version.py +1 -1
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  55. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  56. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  57. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  58. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  59. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  60. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  61. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  62. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  63. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  64. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  65. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
  66. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  67. azure/ai/evaluation/simulator/_simulator.py +127 -117
  68. azure/ai/evaluation/simulator/_tracing.py +4 -4
  69. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
  70. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  71. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
  72. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  73. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  74. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  75. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  76. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  77. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  78. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py

@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -30,12 +32,12 @@ class BatchRunContext:
     ]
     """
 
-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
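The new __exit__ signature spells out the standard context-manager protocol instead of taking untyped parameters. A minimal self-contained sketch of the same typing (the Resource class here is illustrative, not part of the package):

    import types
    from typing import Optional, Type


    class Resource:
        def __enter__(self) -> None:
            # Like BatchRunContext, __enter__ returns None, so
            # "with Resource() as r" binds r to None.
            print("entered")

        def __exit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            exc_tb: Optional[types.TracebackType],
        ) -> None:
            # A None (falsy) return propagates any in-flight exception.
            print("exited")


    with Resource():
        pass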
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py

@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)
 
 class CodeRun:
     def __init__(
-        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
-    ):
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics = aggregated_metrics
+        self.aggregated_metrics = aggregator(self)
 
-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df
 
-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
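Replacing the aggregated_metrics argument with an aggregator callable removes the construct-then-mutate dance: the constructor hands the finished instance to the callback, which schedules the aggregation future. A minimal sketch of the pattern (the Job class and the doubling lambda are illustrative):

    from concurrent.futures import Future, ThreadPoolExecutor
    from typing import Callable

    pool = ThreadPoolExecutor()


    class Job:
        def __init__(self, *, value: int, aggregator: Callable[["Job"], Future]) -> None:
            self.value = value
            # The callback receives the constructed instance, so no field
            # has to be patched in after __init__ returns.
            self.aggregated: Future = aggregator(self)


    job = Job(value=21, aggregator=lambda j: pool.submit(lambda: j.value * 2))
    print(job.aggregated.result())  # 42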
@@ -104,10 +111,10 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )
 
-    def _calculate_aggregations(self, evaluator, run):
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-        run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
-        aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
-        run.aggregated_metrics = aggregation_future
-        return run
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )
 
     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run: CodeRun) -> Optional[None]:
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return None
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
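get_metrics now returns {} on failure instead of None (the old Optional[None] annotation could only ever mean None), so callers can merge results without a guard. A hypothetical caller, for illustration:

    from typing import Any, Dict, List


    class StubClient:
        # Stands in for CodeClient; returns {} on the failure path.
        def get_metrics(self, run: str) -> Dict[str, Any]:
            return {} if run == "broken" else {f"{run}.score": 1.0}


    def collect(client: StubClient, runs: List[str]) -> Dict[str, Any]:
        merged: Dict[str, Any] = {}
        for run in runs:
            merged.update(client.get_metrics(run))  # no None check needed
        return merged


    print(collect(StubClient(), ["a", "broken", "b"]))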
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

@@ -3,11 +3,12 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,13 +54,27 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)", np.nan, inplace=True)
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
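Swapping np.nan for math.nan lets the module drop its direct numpy import; both are plain float('nan') values, and pandas treats them identically. A quick check with illustrative data:

    import math

    import pandas as pd

    df = pd.DataFrame({"relevance": ["0.9", "(Failed)", "0.7"]})
    df.replace("(Failed)", math.nan, inplace=True)
    print(df["relevance"].isna().tolist())  # [False, True, False]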
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -10,10 +10,11 @@ import posixpath
 import time
 import types
 import uuid
-from typing import Any, Dict, Optional, Set, Type
+from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
 from promptflow._sdk.entities import Run
+from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
@@ -27,6 +28,7 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
+    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
@@ -121,8 +123,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base = None
-        self.info = None
+        self._url_base: Optional[str] = None
+        self._info: Optional[RunInfo] = None
 
     @property
     def status(self) -> RunStatus:
@@ -134,6 +136,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return self._status
 
+    @property
+    def info(self) -> RunInfo:
+        if self._info is None:
+            msg = "Run info is missing"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVAL_RUN,
+                category=ErrorCategory.UNKNOWN,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        return self._info
+
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
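Making info a guarded property turns silent None propagation into an immediate, descriptive failure at the access site. The same guard in isolation (with simplified stand-ins for RunInfo and EvaluationException):

    from typing import Optional


    class Run:
        def __init__(self) -> None:
            self._info: Optional[str] = None  # populated once the run starts

        @property
        def info(self) -> str:
            if self._info is None:
                # Fail loudly here rather than letting None flow downstream.
                raise RuntimeError("Run info is missing")
            return self._info


    try:
        Run().info
    except RuntimeError as err:
        print(err)  # Run info is missing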
@@ -161,11 +177,11 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.info = RunInfo.generate(self._run_name)
+            self._info = RunInfo.generate(self._run_name)
         else:
             self._url_base = urlparse(self._tracking_uri).netloc
             if self._promptflow_run is not None:
-                self.info = RunInfo(
+                self._info = RunInfo(
                     self._promptflow_run.name,
                     self._promptflow_run._experiment_name,  # pylint: disable=protected-access
                     self._promptflow_run.name,
@@ -182,7 +198,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             body["run_name"] = self._run_name
             response = self.request_with_retry(url=url, method="POST", json_dict=body)
             if response.status_code != 200:
-                self.info = RunInfo.generate(self._run_name)
+                self._info = RunInfo.generate(self._run_name)
                 LOGGER.warning(
                     "The run failed to start: %s: %s."
                     "The results will be saved locally, but will not be logged to Azure.",
@@ -192,7 +208,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                 self._status = RunStatus.BROKEN
             else:
                 parsed_response = response.json()
-                self.info = RunInfo(
+                self._info = RunInfo(
                     run_id=parsed_response["run"]["info"]["run_id"],
                     experiment_id=parsed_response["run"]["info"]["experiment_id"],
                     run_name=parsed_response["run"]["info"]["run_name"],
@@ -235,7 +251,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         """The Context Manager enter call.
 
         :return: The instance of the class.
@@ -249,7 +265,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
-    ) -> Optional[bool]:
+    ) -> None:
         """The context manager exit call.
 
         :param exc_type: The exception type
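Annotating __enter__ with typing_extensions.Self keeps the return type accurate for subclasses, where a hard-coded class name would not. A minimal sketch, assuming typing_extensions is available:

    from typing_extensions import Self


    class Managed:
        def __enter__(self) -> Self:
            return self

        def __exit__(self, *exc) -> None:
            return None


    class Derived(Managed):
        pass


    with Derived() as d:
        # A type checker sees d as Derived, not Managed, thanks to Self.
        print(type(d).__name__)  # Derived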
@@ -408,7 +424,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths = {"paths": []}
+        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):
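The new annotation documents the manifest shape {"paths": [...]} that the upload loop builds. A sketch of populating such a manifest with os.walk; the "path" entry key is a guess for illustration, not confirmed from the source:

    import os
    import posixpath
    from typing import Dict, List


    def build_manifest(artifact_folder: str, root_upload_path: str) -> Dict[str, List[Dict[str, str]]]:
        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
        for root, _, filenames in os.walk(artifact_folder):
            for name in filenames:
                local_path = os.path.join(root, name)
                relative = os.path.relpath(local_path, artifact_folder)
                # Remote blob paths use POSIX separators regardless of local OS.
                remote = posixpath.join(root_upload_path, *relative.split(os.sep))
                remote_paths["paths"].append({"path": remote})  # hypothetical entry key
        return remote_paths


    print(build_manifest(".", "promptflow/PromptFlowArtifacts/example-run"))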