azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff reflects the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and shows the changes between the two versions as published.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (93)
  1. azure/ai/evaluation/__init__.py +23 -1
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
  3. azure/ai/evaluation/_common/constants.py +9 -2
  4. azure/ai/evaluation/_common/math.py +29 -0
  5. azure/ai/evaluation/_common/rai_service.py +222 -93
  6. azure/ai/evaluation/_common/utils.py +328 -19
  7. azure/ai/evaluation/_constants.py +16 -8
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
  10. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
  11. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  13. azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
  14. azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
  16. azure/ai/evaluation/_evaluate/_utils.py +77 -25
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
  19. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  20. azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
  21. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
  22. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
  24. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
  25. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
  26. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
  27. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
  28. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
  29. azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
  30. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  31. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
  32. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  33. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  34. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
  35. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  46. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
  47. azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
  48. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
  49. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  50. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
  51. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  52. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  53. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  55. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
  56. azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
  57. azure/ai/evaluation/_exceptions.py +26 -6
  58. azure/ai/evaluation/_http_utils.py +203 -132
  59. azure/ai/evaluation/_model_configurations.py +23 -6
  60. azure/ai/evaluation/_vendor/__init__.py +3 -0
  61. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  62. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  63. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  64. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  65. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  66. azure/ai/evaluation/_version.py +1 -1
  67. azure/ai/evaluation/simulator/__init__.py +2 -1
  68. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  69. azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
  70. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  71. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  72. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  73. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  74. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  75. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  76. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
  78. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  80. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  81. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  82. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
  83. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  84. azure/ai/evaluation/simulator/_simulator.py +222 -169
  85. azure/ai/evaluation/simulator/_tracing.py +4 -4
  86. azure/ai/evaluation/simulator/_utils.py +6 -6
  87. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
  88. azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
  89. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  90. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  91. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  92. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
  93. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_common/utils.py

@@ -2,21 +2,30 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import re
+import math
 import threading
-from typing import List, Union
+from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin

 import nltk
-import numpy as np
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from typing_extensions import NotRequired, Required, TypeGuard
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+)

 from . import constants

 _nltk_data_download_lock = threading.Lock()

+T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
+

-def get_harm_severity_level(harm_score: int) -> str:
+def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
@@ -30,23 +39,29 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score == np.nan or harm_score is None:
-        return np.nan
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return np.nan
+    return math.nan


 def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
+    nltk_data = [
+        ("wordnet", "corpora/wordnet.zip"),
+        ("perluniprops", "misc/perluniprops.zip"),
+        ("punkt", "tokenizers/punkt.zip"),
+        ("punkt_tab", "tokenizers/punkt_tab.zip"),
+    ]
+
     with _nltk_data_download_lock:
-        try:
-            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
-        except LookupError:
-            nltk.download("perluniprops")
-            nltk.download("punkt")
-            nltk.download("punkt_tab")
+        for _id, resource_name in nltk_data:
+            try:
+                nltk.find(resource_name)
+            except LookupError:
+                nltk.download(_id)


 def nltk_tokenize(text: str) -> List[str]:
@@ -71,12 +86,20 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)


+def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment"))
+
+
+def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model"))
+
+
 def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
 ) -> None:
-    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
+    if _is_aoi_model_config(model_config):
         model_config["type"] = AZURE_OPENAI_TYPE
-    else:
+    elif _is_openai_model_config(model_config):
         model_config["type"] = OPENAI_TYPE


@@ -87,16 +110,302 @@ def construct_prompty_model_config(
 ) -> dict:
     parse_model_config_type(model_config)

-    if model_config["type"] == AZURE_OPENAI_TYPE:
+    if _is_aoi_model_config(model_config):
         model_config["api_version"] = model_config.get("api_version", default_api_version)

-    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+    prompty_model_config: dict = {"configuration": model_config, "parameters": {"extra_headers": {}}}

     # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
     # https://github.com/encode/httpx/discussions/2959
     prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-    if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
+    if _is_aoi_model_config(model_config) and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

     return prompty_model_config
+
+
+def validate_azure_ai_project(o: object) -> AzureAIProject:
+    fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
+
+    if not isinstance(o, dict):
+        msg = "The 'azure_ai_project' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    missing_fields = set(fields.keys()) - o.keys()
+
+    if missing_fields:
+        msg = (
+            "The 'azure_ai_project' dictionary is missing the following required "
+            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
+        )
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    for field_name, expected_type in fields.items():
+        if isinstance(o[field_name], expected_type):
+            continue
+
+        msg = f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}."
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return cast(AzureAIProject, o)
+
+
+def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
+    try:
+        return _validate_typed_dict(config, AzureOpenAIModelConfiguration)
+    except TypeError:
+        try:
+            return _validate_typed_dict(config, OpenAIModelConfiguration)
+        except TypeError as e:
+            msg = "Model config validation failed."
+            raise EvaluationException(
+                message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR
+            ) from e
+
+
+def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
+    """Do very basic runtime validation that an object is a typed dict
+
+    .. warning::
+
+        This validation is very basic, robust enough to cover some very simple TypedDicts.
+        Ideally, validation of this kind should be delegated to something more robust.
+
+        You will very quickly run into limitations trying to apply this function more broadly:
+            * Doesn't support stringized annotations at all
+            * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types.
+            * Error messages are poor, especially if there is any nesting.
+
+    :param object o: The object to check
+    :param Type[T_TypedDict] t: The TypedDict to validate against
+    :raises NotImplementedError: Several forms of validation are unsupported
+        * Checking against stringized annotations
+        * Checking a generic that is not one of a few basic forms
+    :raises TypeError: If a value does not match the specified annotation
+    :raises ValueError: If t's annotation is not a string, type of a special form (e.g. NotRequired, Required, etc...)
+    :returns: The object passed in
+    :rtype: T_TypedDict
+    """
+    if not isinstance(o, dict):
+        raise TypeError(f"Expected type 'dict', got type '{type(object)}'.")
+
+    annotations = t.__annotations__
+    is_total = getattr(t, "__total__", False)
+    unknown_keys = set(o.keys()) - annotations.keys()
+
+    if unknown_keys:
+        raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}")
+
+    required_keys = {
+        k
+        for k in annotations
+        if (is_total and get_origin(annotations[k]) is not NotRequired)
+        or (not is_total and get_origin(annotations[k]) is Required)
+    }
+
+    missing_keys = required_keys - o.keys()
+
+    if missing_keys:
+        raise TypeError(f"Missing required keys: {list(missing_keys)!r}.")
+
+    def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool:
+        if isinstance(annotation, str):
+            raise NotImplementedError("Missing support for validating against stringized annotations.")
+
+        if (origin := get_origin(annotation)) is not None:
+            if origin is tuple:
+                validate_annotation(v, tuple)
+                tuple_args = get_args(annotation)
+                if len(cast(tuple, v)) != len(tuple_args):
+                    raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.")
+                for tuple_val, tuple_args in zip(cast(tuple, v), tuple_args):
+                    validate_annotation(tuple_val, tuple_args)
+            elif origin is dict:
+                validate_annotation(v, dict)
+                dict_key_ann, dict_val_ann = get_args(annotation)
+                for dict_key, dict_val in cast(dict, v).items():
+                    validate_annotation(dict_val, dict_val_ann)
+                    validate_annotation(dict_key, dict_key_ann)
+            elif origin is list:
+                validate_annotation(v, list)
+                list_val_ann = get_args(annotation)[0]
+                for list_val in cast(list, v):
+                    validate_annotation(list_val, list_val_ann)
+            elif origin is Union:
+                for generic_arg in get_args(annotation):
+                    try:
+                        validate_annotation(v, generic_arg)
+                        return True
+                    except TypeError:
+                        pass
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}")
+            elif origin is Literal:
+                literal_args = get_args(annotation)
+                if not any(type(literal) is type(v) and literal == v for literal in literal_args):
+                    raise TypeError(f"Expected value to be one of {list(literal_args)!r}. Received type {type(v)}")
+            elif any(origin is g for g in (NotRequired, Required)):
+                validate_annotation(v, get_args(annotation)[0])
+            else:
+                raise NotImplementedError(f"Validation not implemented for generic {origin}.")
+            return True
+
+        if isinstance(annotation, type):
+            if not isinstance(v, annotation):
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.")
+            return True
+
+        raise ValueError("Annotation to validate against should be a str, type, or generic.")
+
+    for k, v in o.items():
+        validate_annotation(v, annotations[k])
+
+    return cast(T_TypedDict, o)
+
+
+def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+    """Parse the output of prompt-based quality evaluators that return a score and reason.
+
+    Current supported evaluators:
+        - Fluency
+        - Relevance
+        - Retrieval
+        - Groundedness
+        - Coherence
+
+    :param llm_output: The output of the prompt-based quality evaluator.
+    :type llm_output: str
+    :return: The score and reason.
+    :rtype: Tuple[float, str]
+    """
+    score = math.nan
+    reason = ""
+    if llm_output:
+        score_pattern = r"<S2>(.*?)</S2>"
+        reason_pattern = r"<S1>(.*?)</S1>"
+        score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+        reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+        if score_match:
+            score = float(score_match[0].strip())
+        if reason_match:
+            reason = reason_match[0].strip()
+
+    return score, reason
+
+
+def remove_optional_singletons(eval_class, singletons):
+    required_singletons = singletons.copy()
+    if hasattr(eval_class, "_OPTIONAL_PARAMS"):  # pylint: disable=protected-access
+        for param in eval_class._OPTIONAL_PARAMS:  # pylint: disable=protected-access
+            if param in singletons:
+                del required_singletons[param]
+    return required_singletons
+
+
+def retrieve_content_type(assistant_messages: List, metric: str) -> str:
+    """Get the content type for service payload.
+
+    :param assistant_messages: The list of messages to be annotated by evaluation service
+    :type assistant_messages: list
+    :param metric: A string representing the metric type
+    :type metric: str
+    :return: A text representing the content type. Example: 'text', or 'image'
+    :rtype: str
+    """
+    # Check if metric is "protected_material"
+    if metric == "protected_material":
+        return "image"
+
+    # Iterate through each message
+    for item in assistant_messages:
+        # Ensure "content" exists in the message and is iterable
+        content = item.get("content", [])
+        for message in content:
+            if message.get("type", "") == "image_url":
+                return "image"
+    # Default return if no image was found
+    return "text"
+
+
+def validate_conversation(conversation):
+    def raise_exception(msg, target):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=target,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    if not conversation or "messages" not in conversation:
+        raise_exception(
+            "Attribute 'messages' is missing in the request",
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
+        )
+    messages = conversation["messages"]
+    if not isinstance(messages, list):
+        raise_exception(
+            "'messages' parameter must be a JSON-compatible list of chat messages",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    expected_roles = {"user", "assistant", "system"}
+    image_found = False
+    for num, message in enumerate(messages, 1):
+        if not isinstance(message, dict):
+            try:
+                from azure.ai.inference.models import (
+                    ChatRequestMessage,
+                    UserMessage,
+                    AssistantMessage,
+                    SystemMessage,
+                    ImageContentItem,
+                )
+            except ImportError as ex:
+                raise MissingRequiredPackage(
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage"
+                ) from ex
+
+            if isinstance(messages[0], ChatRequestMessage) and not isinstance(
+                message, (UserMessage, AssistantMessage, SystemMessage)
+            ):
+                raise_exception(
+                    f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
+                    ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                )
+
+            if isinstance(message.content, list) and any(
+                isinstance(item, ImageContentItem) for item in message.content
+            ):
+                image_found = True
+            continue
+        if message.get("role") not in expected_roles:
+            raise_exception(
+                f"Invalid role provided: {message.get('role')}. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        content = message.get("content")
+        if not isinstance(content, (str, list)):
+            raise_exception(
+                f"Content in each turn must be a string or array. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        if isinstance(content, list):
+            if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                image_found = True
+    if not image_found:
+        raise_exception(
+            "Message needs to have multi-modal input like images.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
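
For orientation, two of the new helpers above can be exercised on their own. A minimal sketch with hypothetical inputs (the sample values are illustrative, not taken from the package):

```python
from azure.ai.evaluation._common.utils import (
    parse_quality_evaluator_reason_score,
    validate_azure_ai_project,
)

# Hypothetical project dict: validation returns it unchanged when all three required fields are strings.
project = validate_azure_ai_project(
    {
        "subscription_id": "00000000-0000-0000-0000-000000000000",
        "resource_group_name": "example-rg",
        "project_name": "example-project",
    }
)

# Hypothetical evaluator output: the reason is read from <S1>...</S1> and the score from <S2>...</S2>.
score, reason = parse_quality_evaluator_reason_score("<S1>Response is clear and fluent.</S1><S2>5</S2>")
print(project["project_name"], score, reason)
```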

azure/ai/evaluation/_constants.py

@@ -1,18 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Literal


 class EvaluationMetrics:
     """Metrics for model evaluation."""

-    GPT_GROUNDEDNESS = "gpt_groundedness"
-    GPT_RELEVANCE = "gpt_relevance"
-    GPT_COHERENCE = "gpt_coherence"
-    GPT_FLUENCY = "gpt_fluency"
-    GPT_SIMILARITY = "gpt_similarity"
+    GROUNDEDNESS = "groundedness"
+    RELEVANCE = "relevance"
+    COHERENCE = "coherence"
+    FLUENCY = "fluency"
+    SIMILARITY = "similarity"
     F1_SCORE = "f1_score"
-    GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
+    RETRIEVAL_SCORE = "retrieval_score"
     HATE_FAIRNESS = "hate_fairness"
     HATE_UNFAIRNESS = "hate_unfairness"
     VIOLENCE = "violence"
@@ -48,6 +49,13 @@ class DefaultOpenEncoding:
     """SDK Default Encoding when writing a file"""


+class EvaluationRunProperties:
+    """Defines properties used to identify an evaluation run by UI"""
+
+    RUN_TYPE = "runType"
+    EVALUATION_RUN = "_azureml.evaluation_run"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -58,6 +66,6 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

-AZURE_OPENAI_TYPE = "azure_openai"
+AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"

-OPENAI_TYPE = "openai"
+OPENAI_TYPE: Literal["openai"] = "openai"
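
The constant renames above drop the `gpt_` prefix from the built-in quality metric names, which suggests (though this hunk alone does not prove it) that downstream metric keys change as well. A small sketch against the new constants:

```python
from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, EvaluationMetrics

# 1.0.0b3 exposed "gpt_groundedness", "gpt_relevance", ...; 1.0.0b5 drops the prefix.
assert EvaluationMetrics.GROUNDEDNESS == "groundedness"
assert EvaluationMetrics.RETRIEVAL_SCORE == "retrieval_score"

# The model-type markers are now typed as Literal values but keep the same runtime strings.
assert AZURE_OPENAI_TYPE == "azure_openai"
```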

azure/ai/evaluation/_evaluate/_batch_run/__init__.py

@@ -1,8 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from .batch_run_context import BatchRunContext
+from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
+from .target_run_context import TargetRunContext

-__all__ = ["CodeClient", "ProxyClient", "BatchRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
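
Because the subpackage moved from `_batch_run_client` to `_batch_run` and `BatchRunContext` became `EvalRunContext`, internal imports shift accordingly. A hedged before/after sketch (names taken from the `__all__` above):

```python
# 1.0.0b3:
# from azure.ai.evaluation._evaluate._batch_run_client import BatchRunContext, CodeClient, ProxyClient

# 1.0.0b5:
from azure.ai.evaluation._evaluate._batch_run import (
    CodeClient,
    EvalRunContext,
    ProxyClient,
    TargetRunContext,
)

print(CodeClient, EvalRunContext, ProxyClient, TargetRunContext)
```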

azure/ai/evaluation/_evaluate/_batch_run/code_client.py

@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast

 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)

 class CodeRun:
     def __init__(
-        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
-    ):
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics = aggregated_metrics
+        self.aggregated_metrics = aggregator(self)

-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df

-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )

-    def _calculate_aggregations(self, evaluator, run):
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-        run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
-        aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
-        run.aggregated_metrics = aggregation_future
-        return run
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )

     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df

-    def get_metrics(self, run: CodeRun) -> Optional[None]:
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return None
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
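
The `CodeRun` constructor is now keyword-only and receives an `aggregator` callback that is called with the run and must return a `Future`, instead of a pre-computed `aggregated_metrics` value. A self-contained toy sketch of that wiring (this is not the SDK class, just the same pattern with a plain thread pool):

```python
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable

import pandas as pd


class MiniRun:
    """Toy stand-in for CodeRun: the aggregator receives the run itself and returns a Future."""

    def __init__(self, *, run: Future, aggregator: Callable[["MiniRun"], Future]) -> None:
        self.run = run
        self.aggregated_metrics = aggregator(self)


pool = ThreadPoolExecutor()
eval_future = pool.submit(lambda: pd.DataFrame({"output": [1, 2, 3]}))
mini = MiniRun(
    run=eval_future,
    aggregator=lambda r: pool.submit(lambda: {"mean_output": float(r.run.result()["output"].mean())}),
)
print(mini.aggregated_metrics.result())  # {'mean_output': 2.0}
pool.shutdown()
```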

azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union

 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -20,22 +22,22 @@ from .code_client import CodeClient
 from .proxy_client import ProxyClient


-class BatchRunContext:
-    """Context manager for batch run clients.
+class EvalRunContext:
+    """Context manager for eval batch run.

     :param client: The client to run in the context.
     :type client: Union[
-        ~azure.ai.evaluation._evaluate._batch_run_client.code_client.CodeClient,
-        ~azure.ai.evaluation._evaluate._batch_run_client.proxy_client.ProxyClient
+        ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
+        ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
     ]
     """

-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False

-    def __enter__(self):
+    def __enter__(self) -> None:
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()

-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         if isinstance(self.client, CodeClient):
             recover_openai_api()

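
The annotated `__exit__` above is just the standard context-manager protocol spelled out with types. For reference, a minimal context manager using the same signature (illustrative only, not part of the package):

```python
import types
from typing import Optional, Type


class NoOpContext:
    """Illustrative only: mirrors the __enter__/__exit__ typing used by EvalRunContext."""

    def __enter__(self) -> None:
        return None

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        return None


with NoOpContext():
    pass
```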

azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py

@@ -1,13 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
+# pylint: disable=protected-access
+
 import inspect
 import logging
+import math
 import os
+from collections import OrderedDict
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union

-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -36,7 +40,7 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         **kwargs
     ) -> ProxyRun:
         flow_to_run = flow
-        if hasattr(flow, "_to_async"):
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access

         batch_use_async = self._should_batch_use_async(flow_to_run)
@@ -53,16 +57,30 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)", np.nan, inplace=True)
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df

     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)

+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
-        if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
             if inspect.iscoroutinefunction(flow):
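
Two behavioral notes fall out of this last hunk: the async batch toggle is now read from `AI_EVALS_BATCH_USE_ASYNC` (the 1.0.0b3 name `PF_EVALS_BATCH_USE_ASYNC` is no longer consulted), and the new `get_run_summary` returns an ordered mapping of run statistics. A hedged sketch:

```python
import os

# Opt out of the async batch path in 1.0.0b5; setting the old PF_EVALS_BATCH_USE_ASYNC has no effect.
os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"

# Keys returned by ProxyClient.get_run_summary, per the hunk above.
summary_keys = ["status", "duration", "completed_lines", "failed_lines", "log_path"]
print(summary_keys)
```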