azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. azure/ai/evaluation/_common/_experimental.py +4 -0
  2. azure/ai/evaluation/_common/math.py +62 -2
  3. azure/ai/evaluation/_common/rai_service.py +80 -29
  4. azure/ai/evaluation/_common/utils.py +50 -16
  5. azure/ai/evaluation/_constants.py +1 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  7. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  8. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
  9. azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
  10. azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
  11. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  12. azure/ai/evaluation/_evaluate/_utils.py +6 -4
  13. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
  19. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  20. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  21. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  22. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  23. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  24. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  25. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  26. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  28. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  29. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  30. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  31. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  32. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  33. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  34. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  36. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  37. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  38. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  39. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  41. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  42. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  44. azure/ai/evaluation/_exceptions.py +2 -0
  45. azure/ai/evaluation/_model_configurations.py +65 -14
  46. azure/ai/evaluation/_version.py +1 -1
  47. azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
  48. azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
  49. azure/ai/evaluation/simulator/_constants.py +11 -1
  50. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  53. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  54. azure/ai/evaluation/simulator/_simulator.py +51 -45
  55. azure/ai/evaluation/simulator/_utils.py +25 -7
  56. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
  57. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  59. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
  60. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
  61. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/_experimental.py

@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import os
 import functools
 import inspect
 import logging
@@ -149,6 +150,9 @@ def _get_indentation_size(doc_string: str) -> int:
 def _should_skip_warning():
     skip_warning_msg = False
 
+    if os.getenv("AI_EVALS_DISABLE_EXPERIMENTAL_WARNING", "false").lower() == "true":
+        skip_warning_msg = True
+
     # Cases where we want to suppress the warning:
     # 1. When converting from REST object to SDK object
     for frame in inspect.stack():
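Per the hunk above, the experimental-feature warning can now be silenced through the environment. A minimal sketch of how a caller might use this (any value other than "true", case-insensitive, leaves the warning enabled):

```python
import os

# Assumption for illustration: setting this before an experimental class or function
# is used makes _should_skip_warning() short-circuit to True, so no warning is logged.
os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "true"
```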
azure/ai/evaluation/_common/math.py

@@ -3,20 +3,44 @@
 # ---------------------------------------------------------
 
 import math
-from typing import List
+from typing import List, Callable, Any
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 def list_sum(lst: List[float]) -> float:
+    """Given a list of floats, return the sum of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The sum of the values in the list.
+    :rtype: float
+    """
+
     return sum(lst)
 
 
 def list_mean(lst: List[float]) -> float:
+    """Given a list of floats, calculate the mean of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     return list_sum(lst) / len(lst)
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     msg = "All score values are NaN. The mean cannot be calculated."
     if all(math.isnan(l) for l in lst):
         raise EvaluationException(
@@ -26,4 +50,40 @@ def list_mean_nan_safe(lst: List[float]) -> float:
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
-    return list_mean([l for l in lst if not math.isnan(l)])
+    return list_mean([l for l in lst if not is_none_or_nan(l)])
+
+
+def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
+    """Given a list of floats, remove all nan values, then apply the inputted transform function
+    to the remaining values, and return the resulting list of outputted values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :param transform_fn: A function that produces something when applied to a float.
+    :type transform_fn: Callable[[float], Any]
+    :return: A list of the transformed values.
+    :rtype: List[Any]
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
+
+
+def is_none_or_nan(val: float) -> bool:
+    """math.isnan raises an error if None is inputted. This is a more robust wrapper.
+
+    :param val: The value to check.
+    :type val: float
+    :return: Whether the value is None or NaN.
+    :rtype: bool
+    """
+
+    return val is None or math.isnan(val)
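A quick sketch of how the nan-safe helpers behave (import path taken from the file list above; expected outputs shown as comments):

```python
import math

from azure.ai.evaluation._common.math import is_none_or_nan, list_mean_nan_safe

print(is_none_or_nan(math.nan))  # True
print(is_none_or_nan(None))      # True
print(is_none_or_nan(3.0))       # False

# NaN entries are filtered out before averaging; a list of only NaNs raises EvaluationException.
print(list_mean_nan_safe([4.0, math.nan, 5.0]))  # 4.5
```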
azure/ai/evaluation/_common/rai_service.py

@@ -6,6 +6,8 @@ import importlib.metadata
 import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
@@ -38,10 +40,38 @@ USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
-    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
 }
 
 
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
 
@@ -83,27 +113,31 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     async with get_async_http_client() as client:
         response = await client.get(svc_liveness_url, headers=headers)
 
-        if response.status_code != 200:
-            msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.UNKNOWN,
-                category=ErrorCategory.SERVICE_UNAVAILABLE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        capabilities = response.json()
+    if response.status_code != 200:
+        msg = (
+            f"RAI service is unavailable in this region, or you lack the necessary permissions "
+            f"to access the AI project. Status Code: {response.status_code}"
+        )
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
 
-        if capability and capability not in capabilities:
-            msg = f"Capability '{capability}' is not available in this region"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.RAI_CLIENT,
-                category=ErrorCategory.SERVICE_UNAVAILABLE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    capabilities = response.json()
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
 
 
 def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
@@ -157,8 +191,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     :return: The operation ID.
     :rtype: str
     """
-    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
-    normalized_user_text = user_text.replace("'", '\\"')
+    normalized_user_text = get_formatted_template(data, annotation_task)
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
@@ -235,13 +268,27 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     }:
-        if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
         result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
@@ -371,13 +418,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
         )
 
     if response.status_code != 200:
-        msg = "Failed to retrieve the discovery service URL."
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
            message=msg,
-            internal_message=msg,
            target=ErrorTarget.RAI_CLIENT,
-            category=ErrorCategory.SERVICE_UNAVAILABLE,
-            blame=ErrorBlame.UNKNOWN,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )
 
     base_url = urlparse(response.json()["properties"]["discoveryUrl"])
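To illustrate what the new get_formatted_template helper sends for a groundedness request, here is a rough sketch based on the hunk above (the module hosting the Tasks constants is an assumption; output shown as a comment):

```python
from azure.ai.evaluation._common.constants import Tasks  # assumed location of the Tasks constants
from azure.ai.evaluation._common.rai_service import get_formatted_template

data = {"query": "What is 2 + 2?", "response": "4", "context": "Basic arithmetic."}

# Groundedness payloads are now serialized with json.dumps instead of a string Template,
# so quotes inside the query/response no longer produce malformed JSON.
print(get_formatted_template(data, Tasks.GROUNDEDNESS))
# {"question": "What is 2 + 2?", "answer": "4", "context": "Basic arithmetic."}
```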
azure/ai/evaluation/_common/utils.py

@@ -293,14 +293,22 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     score = math.nan
     reason = ""
     if llm_output:
-        score_pattern = r"<S2>(.*?)</S2>"
-        reason_pattern = r"<S1>(.*?)</S1>"
-        score_match = re.findall(score_pattern, llm_output, re.DOTALL)
-        reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
-        if score_match:
-            score = float(score_match[0].strip())
-        if reason_match:
-            reason = reason_match[0].strip()
+        try:
+            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            reason_pattern = r"<S1>(.*?)</S1>"
+            score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+            reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+            if score_match:
+                score = float(score_match[0].strip())
+            if reason_match:
+                reason = reason_match[0].strip()
+        except ValueError as exc:
+            raise EvaluationException(
+                message=f"Failed to parse model output: \n{llm_output}",
+                internal_message="Failed to parse model output.",
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
+            ) from exc
 
     return score, reason
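The tightened score pattern only accepts a 1-5 digit between the <S2> tags; a quick illustration with a made-up model output:

```python
import re

score_pattern = r"<S2>\D*?([1-5]).*?</S2>"

sample = "<S1>The response addresses the question directly.</S1><S2>Score: 4 out of 5</S2>"
match = re.findall(score_pattern, sample, re.DOTALL)
print(float(match[0]))  # 4.0
```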
 
@@ -329,12 +337,12 @@ def retrieve_content_type(assistant_messages: List, metric: str) -> str:
         return "image"
 
     # Iterate through each message
-    for item in assistant_messages:
+    for message in assistant_messages:
         # Ensure "content" exists in the message and is iterable
-        content = item.get("content", [])
-        for message in content:
-            if message.get("type", "") == "image_url":
-                return "image"
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                if content.get("type") == "image_url":
+                    return "image"
     # Default return if no image was found
     return "text"
 
@@ -362,6 +370,8 @@ def validate_conversation(conversation):
         )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
+    assistant_message_count = 0
+    user_message_count = 0
     for num, message in enumerate(messages, 1):
         if not isinstance(message, dict):
             try:
@@ -374,17 +384,21 @@ def validate_conversation(conversation):
                 )
             except ImportError as ex:
                 raise MissingRequiredPackage(
-                    message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage"
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, "
+                    "UserMessage or AssistantMessage."
                 ) from ex
 
-            if isinstance(messages[0], ChatRequestMessage) and not isinstance(
+            if isinstance(message, ChatRequestMessage) and not isinstance(
                 message, (UserMessage, AssistantMessage, SystemMessage)
             ):
                 raise_exception(
                     f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
                     ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                 )
-
+            if isinstance(message, AssistantMessage):
+                assistant_message_count += 1
+            if isinstance(message, UserMessage):
+                user_message_count += 1
             if isinstance(message.content, list) and any(
                 isinstance(item, ImageContentItem) for item in message.content
             ):
@@ -395,6 +409,10 @@ def validate_conversation(conversation):
                    f"Invalid role provided: {message.get('role')}. Message number: {num}",
                    ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                )
+            if message.get("role") == "assistant":
+                assistant_message_count += 1
+            if message.get("role") == "user":
+                user_message_count += 1
             content = message.get("content")
             if not isinstance(content, (str, list)):
                 raise_exception(
@@ -409,3 +427,19 @@ def validate_conversation(conversation):
             "Message needs to have multi-modal input like images.",
             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
         )
+    if assistant_message_count == 0:
+        raise_exception(
+            "Assistant role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if user_message_count == 0:
+        raise_exception(
+            "User role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if assistant_message_count > 1:
+        raise_exception(
+            "Evaluators for multimodal conversations only support single turn. "
+            "User and assistant role expected as the only role in each message.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
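For reference, a conversation shaped roughly like the following would satisfy the new role checks (single user turn, single assistant turn, with image content); the exact message structure and URL are illustrative, not taken from the package:

```python
conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "The image shows a cat sitting on a couch."}],
        },
    ]
}
```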
azure/ai/evaluation/_constants.py

@@ -62,6 +62,7 @@ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
 
 PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+PF_DISABLE_TRACING = "PF_DISABLE_TRACING"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

@@ -14,6 +14,7 @@ from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+    PF_DISABLE_TRACING,
 )
 
 from ..._user_agent import USER_AGENT
@@ -36,8 +37,12 @@ class EvalRunContext:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -45,6 +50,7 @@ class EvalRunContext:
         if isinstance(self.client, ProxyClient):
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+            os.environ[PF_DISABLE_TRACING] = "true"
 
             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -64,12 +70,15 @@ class EvalRunContext:
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
         if isinstance(self.client, ProxyClient):
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+            os.environ.pop(PF_DISABLE_TRACING, None)
 
         if self._is_batch_timeout_set_by_system:
             os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py

@@ -68,12 +68,22 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         run = proxy_run.run.result()
 
         # pylint: disable=protected-access
+        completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+        failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+        if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+            status = "Completed with Errors"
+        else:
+            status = run.status
+
+        # Return the ordered dictionary with the updated status
         return OrderedDict(
             [
-                ("status", run.status),
+                ("status", status),
                 ("duration", str(run._end_time - run._created_on)),
-                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
-                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("completed_lines", completed_lines),
+                ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

@@ -6,6 +6,7 @@ import types
 from typing import Optional, Type
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
 class TargetRunContext:
@@ -17,19 +18,29 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
         if not self._upload_snapshot:
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
 
+        os.environ[PF_DISABLE_TRACING] = "true"
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+        os.environ.pop(PF_DISABLE_TRACING, None)
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -34,14 +34,15 @@ try:
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
 except (ModuleNotFoundError, ImportError):
-    # If the above mentioned modules cannot be imported, we are running
-    # in local mode and MLClient in the constructor will be None, so
-    # we will not arrive to Azure-dependent code.
-
-    # We are logging the import failure only if debug logging level is set because:
-    # - If the project configuration was not provided this import is not needed.
-    # - If the project configuration was provided, the error will be raised by PFClient.
-    LOGGER.debug("promptflow.azure is not installed.")
+    raise EvaluationException(  # pylint: disable=raise-missing-from
+        message=(
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        ),
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.MISSING_PACKAGE,
+        blame=ErrorBlame.USER_ERROR,
+    )
 
 
 @dataclasses.dataclass
@@ -103,7 +104,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     _SCOPE = "https://management.azure.com/.default"
 
     EVALUATION_ARTIFACT = "instance_results.jsonl"
-    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"
 
     def __init__(
         self,
@@ -413,7 +413,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
             return
-        # Check if artifact dirrectory is empty or does not exist.
+        # Check if artifact directory is empty or does not exist.
         if not os.path.isdir(artifact_folder):
             LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
             return
@@ -469,6 +469,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         # we are rewriting already registered artifact and need to skip this step.
         if self._is_promptflow_run:
             return
+
         url = (
             f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
             f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -491,6 +492,29 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if response.status_code != 200:
             self._log_warning("register artifact", response)
 
+        # register artifacts for images if exists in image folder
+        try:
+            for remote_path in remote_paths["paths"]:
+                remote_file_path = remote_path["path"]
+                if "images" in os.path.normpath(remote_file_path).split(os.sep):
+                    response = self.request_with_retry(
+                        url=url,
+                        method="POST",
+                        json_dict={
+                            "origin": "ExperimentRun",
+                            "container": f"dcid.{self.info.run_id}",
+                            "path": posixpath.join("images", os.path.basename(remote_file_path)),
+                            "dataPath": {
+                                "dataStoreName": datastore.name,
+                                "relativePath": remote_file_path,
+                            },
+                        },
+                    )
+                    if response.status_code != 200:
+                        self._log_warning("register image artifact", response)
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
+
     def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103