azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.

Files changed (72)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,8 @@ import importlib.metadata
  import math
  import re
  import time
+ import json
+ import html
  from ast import literal_eval
  from typing import Dict, List, Optional, Union, cast
  from urllib.parse import urlparse
@@ -38,22 +40,53 @@ USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

  USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
      "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
-     Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
  }


- def get_common_headers(token: str) -> Dict:
+ def get_formatted_template(data: dict, annotation_task: str) -> str:
+     """Given the task and input data, produce a formatted string that will serve as the main
+     payload for the RAI service. Requires specific per-task logic.
+
+     :param data: The data to incorporate into the payload.
+     :type data: dict
+     :param annotation_task: The annotation task to use. This determines the template to use.
+     :type annotation_task: str
+     :return: The formatted based on the data and task template.
+     :rtype: str
+     """
+     # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+     # JSON format manually.
+     # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+     if annotation_task == Tasks.GROUNDEDNESS:
+         as_dict = {
+             "question": data.get("query", ""),
+             "answer": data.get("response", ""),
+             "context": data.get("context", ""),
+         }
+         return json.dumps(as_dict)
+     as_dict = {
+         "query": html.escape(data.get("query", "")),
+         "response": html.escape(data.get("response", "")),
+     }
+     user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+     return user_text.replace("'", '\\"')
+
+
+ def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
      """Get common headers for the HTTP request

      :param token: The Azure authentication token.
      :type token: str
+     :param evaluator_name: The evaluator name. Default is None.
+     :type evaluator_name: str
      :return: The common headers.
      :rtype: Dict
      """
+     user_agent = f"{USER_AGENT} (type=evaluator; subtype={evaluator_name})" if evaluator_name else USER_AGENT
      return {
          "Authorization": f"Bearer {token}",
          "Content-Type": "application/json",
-         "User-Agent": USER_AGENT,
+         "User-Agent": user_agent,
          # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
          # https://github.com/encode/httpx/discussions/2959
          "Connection": "close",
@@ -83,27 +116,31 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
      async with get_async_http_client() as client:
          response = await client.get(svc_liveness_url, headers=headers)

-     if response.status_code != 200:
-         msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
-         raise EvaluationException(
-             message=msg,
-             internal_message=msg,
-             target=ErrorTarget.UNKNOWN,
-             category=ErrorCategory.SERVICE_UNAVAILABLE,
-             blame=ErrorBlame.USER_ERROR,
-         )
-
-     capabilities = response.json()
+         if response.status_code != 200:
+             msg = (
+                 f"RAI service is unavailable in this region, or you lack the necessary permissions "
+                 f"to access the AI project. Status Code: {response.status_code}"
+             )
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.RAI_CLIENT,
+                 category=ErrorCategory.SERVICE_UNAVAILABLE,
+                 blame=ErrorBlame.USER_ERROR,
+                 tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+             )

-     if capability and capability not in capabilities:
-         msg = f"Capability '{capability}' is not available in this region"
-         raise EvaluationException(
-             message=msg,
-             internal_message=msg,
-             target=ErrorTarget.RAI_CLIENT,
-             category=ErrorCategory.SERVICE_UNAVAILABLE,
-             blame=ErrorBlame.USER_ERROR,
-         )
+         capabilities = response.json()
+         if capability and capability not in capabilities:
+             msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.RAI_CLIENT,
+                 category=ErrorCategory.SERVICE_UNAVAILABLE,
+                 blame=ErrorBlame.USER_ERROR,
+                 tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+             )


  def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
@@ -141,7 +178,9 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
      )


- async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
+ async def submit_request(
+     data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+ ) -> str:
      """Submit request to Responsible AI service for evaluation and return operation ID

      :param data: The data to evaluate.
@@ -154,15 +193,16 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
      :type token: str
      :param annotation_task: The annotation task to use.
      :type annotation_task: str
+     :param evaluator_name: The evaluator name.
+     :type evaluator_name: str
      :return: The operation ID.
      :rtype: str
      """
-     user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
-     normalized_user_text = user_text.replace("'", '\\"')
+     normalized_user_text = get_formatted_template(data, annotation_task)
      payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

      url = rai_svc_url + "/submitannotation"
-     headers = get_common_headers(token)
+     headers = get_common_headers(token, evaluator_name)

      async with get_async_http_client_with_timeout() as client:
          http_response = await client.post(url, json=payload, headers=headers)
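submit_request now delegates payload construction to the new get_formatted_template helper and passes the evaluator name through to get_common_headers, which tags the User-Agent with it. Below is a minimal sketch of the formatting behavior, using simplified stand-ins for the SDK's Tasks constant and template dict (the names here are illustrative, not the SDK's):

    import html
    import json
    from string import Template

    GROUNDEDNESS = "groundedness"  # stand-in for Tasks.GROUNDEDNESS
    DEFAULT_TEMPLATE = Template("<Human>{$query}</><System>{$response}</>")

    def format_payload(data: dict, annotation_task: str) -> str:
        # Groundedness payloads are emitted via json.dumps, so quotes or braces
        # inside user data can no longer corrupt the JSON body.
        if annotation_task == GROUNDEDNESS:
            return json.dumps(
                {
                    "question": data.get("query", ""),
                    "answer": data.get("response", ""),
                    "context": data.get("context", ""),
                }
            )
        # All other tasks HTML-escape the query/response before template substitution.
        escaped = {
            "query": html.escape(data.get("query", "")),
            "response": html.escape(data.get("response", "")),
        }
        return DEFAULT_TEMPLATE.substitute(**escaped).replace("'", '\\"')

    print(format_payload({"query": 'Say "hi"', "response": "ok", "context": "greeting"}, GROUNDEDNESS))
    # -> {"question": "Say \"hi\"", "answer": "ok", "context": "greeting"}
    print(format_payload({"query": "<b>hi</b>", "response": "ok"}, "content harm"))
    # -> <Human>{&lt;b&gt;hi&lt;/b&gt;}</><System>{ok}</>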
@@ -235,13 +275,27 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
          _InternalEvaluationMetrics.ECI,
          EvaluationMetrics.XPIA,
      }:
-         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
+         result = {}
+         if not batch_response or len(batch_response[0]) == 0:
+             return {}
+         if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+             pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+             for pm_metric_name in pm_metric_names:
+                 response = batch_response[0][pm_metric_name]
+                 response = response.replace("false", "False")
+                 response = response.replace("true", "True")
+                 parsed_response = literal_eval(response)
+                 result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                 result[pm_metric_name + "_reason"] = (
+                     parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                 )
+             return result
+         if metric_name not in batch_response[0]:
              return {}
          response = batch_response[0][metric_name]
          response = response.replace("false", "False")
          response = response.replace("true", "True")
          parsed_response = literal_eval(response)
-         result = {}
          # Use label instead of score since these are assumed to be boolean results.
          # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
          result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
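parse_response also gains a dedicated branch for protected-material results: when the service returns per-category verdicts instead of a single protected_material entry, each of artwork, fictional_characters and logos_and_brands is flattened into its own *_label / *_reason pair. A rough illustration of that flattening, with a made-up batch response standing in for real service output:

    from ast import literal_eval

    # Made-up first element of a batch response for the protected-material task.
    batch_item = {
        "artwork": '{"label": false, "reasoning": "No known artwork detected."}',
        "fictional_characters": '{"label": false, "reasoning": "No fictional characters detected."}',
        "logos_and_brands": '{"label": true, "reasoning": "A brand logo is described."}',
    }

    result = {}
    for name, raw in batch_item.items():
        parsed = literal_eval(raw.replace("false", "False").replace("true", "True"))
        result[name + "_label"] = parsed.get("label")
        result[name + "_reason"] = parsed.get("reasoning", "")

    # result -> {"artwork_label": False, "artwork_reason": "No known artwork detected.",
    #            ..., "logos_and_brands_label": True, "logos_and_brands_reason": "A brand logo is described."}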
@@ -371,13 +425,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
      )

      if response.status_code != 200:
-         msg = "Failed to retrieve the discovery service URL."
+         msg = (
+             f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+             f"and make sure you have the necessary access permissions. "
+             f"Status code: {response.status_code}."
+         )
          raise EvaluationException(
              message=msg,
-             internal_message=msg,
              target=ErrorTarget.RAI_CLIENT,
-             category=ErrorCategory.SERVICE_UNAVAILABLE,
-             blame=ErrorBlame.UNKNOWN,
+             blame=ErrorBlame.USER_ERROR,
+             category=ErrorCategory.PROJECT_ACCESS_ERROR,
+             tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
          )

      base_url = urlparse(response.json()["properties"]["discoveryUrl"])
@@ -442,24 +500,26 @@ async def evaluate_with_rai_service(
      credential: TokenCredential,
      annotation_task: str = Tasks.CONTENT_HARM,
      metric_display_name=None,
+     evaluator_name=None,
  ) -> Dict[str, Union[str, float]]:
-     """ "Evaluate the content safety of the response using Responsible AI service
+     """Evaluate the content safety of the response using Responsible AI service

-     :param data: The data to evaluate.
-     :type data: dict
-     :param metric_name: The evaluation metric to use.
-     :type metric_name: str
-     :param project_scope: The Azure AI project scope details.
-     :type project_scope: Dict
-     :param credential: The Azure authentication credential.
-     :type credential:
-         ~azure.core.credentials.TokenCredential
-     :param annotation_task: The annotation task to use.
-     :type annotation_task: str
-     :param metric_display_name: The display name of metric to use.
-     :type metric_display_name: str
-     :return: The parsed annotation result.
-     :rtype: Dict[str, Union[str, float]]
+     :param data: The data to evaluate.
+     :type data: dict
+     :param metric_name: The evaluation metric to use.
+     :type metric_name: str
+     :param project_scope: The Azure AI project scope details.
+     :type project_scope: Dict
+     :param credential: The Azure authentication credential.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param annotation_task: The annotation task to use.
+     :type annotation_task: str
+     :param metric_display_name: The display name of metric to use.
+     :type metric_display_name: str
+     :param evaluator_name: The evaluator name to use.
+     :type evaluator_name: str
+     :return: The parsed annotation result.
+     :rtype: Dict[str, Union[str, float]]
      """

      # Get RAI service URL from discovery service and check service availability
@@ -468,7 +528,7 @@ async def evaluate_with_rai_service(
      await ensure_service_availability(rai_svc_url, token, annotation_task)

      # Submit annotation request and fetch result
-     operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+     operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
      annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
      result = parse_response(annotation_response, metric_name, metric_display_name)
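The new evaluator_name parameter is threaded from the public evaluators down to submit_request, so the RAI service can attribute calls to a specific evaluator via the User-Agent. evaluate_with_rai_service is an internal helper (note the _common module), so the sketch below is illustrative only; the project scope values and credential are placeholders:

    import asyncio

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

    azure_ai_project = {
        "subscription_id": "<subscription-id>",      # placeholder
        "resource_group_name": "<resource-group>",   # placeholder
        "project_name": "<project-name>",            # placeholder
    }

    async def main():
        result = await evaluate_with_rai_service(
            data={"query": "What is the capital of France?", "response": "Paris"},
            metric_name="violence",
            project_scope=azure_ai_project,
            credential=DefaultAzureCredential(),
            evaluator_name="ViolenceEvaluator",
        )
        # The request carries a User-Agent of roughly:
        #   azure-ai-evaluation/<version> (type=evaluator; subtype=ViolenceEvaluator)
        print(result)

    asyncio.run(main())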

@@ -293,14 +293,22 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
      score = math.nan
      reason = ""
      if llm_output:
-         score_pattern = r"<S2>(.*?)</S2>"
-         reason_pattern = r"<S1>(.*?)</S1>"
-         score_match = re.findall(score_pattern, llm_output, re.DOTALL)
-         reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
-         if score_match:
-             score = float(score_match[0].strip())
-         if reason_match:
-             reason = reason_match[0].strip()
+         try:
+             score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+             reason_pattern = r"<S1>(.*?)</S1>"
+             score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+             reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+             if score_match:
+                 score = float(score_match[0].strip())
+             if reason_match:
+                 reason = reason_match[0].strip()
+         except ValueError as exc:
+             raise EvaluationException(
+                 message=f"Failed to parse model output: \n{llm_output}",
+                 internal_message="Failed to parse model output.",
+                 category=ErrorCategory.FAILED_EXECUTION,
+                 blame=ErrorBlame.SYSTEM_ERROR,
+             ) from exc

      return score, reason
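The score regex is also tightened: instead of capturing everything between <S2> and </S2> and hoping float() succeeds, it skips leading non-digits and captures a single digit from 1 to 5, and any remaining parse failure is re-raised as an EvaluationException. A quick check of what the new pattern tolerates:

    import re

    score_pattern = r"<S2>\D*?([1-5]).*?</S2>"

    # The old pattern r"<S2>(.*?)</S2>" would capture "Score: 4 out of 5" here,
    # and float(...) on that string would raise ValueError.
    llm_output = "<S1>The answer is mostly grounded.</S1><S2>Score: 4 out of 5</S2>"
    print(re.findall(score_pattern, llm_output, re.DOTALL))  # ['4']

    # With no digit in the 1-5 range there is simply no match, and the score
    # stays math.nan instead of raising.
    print(re.findall(score_pattern, "<S2>not rated</S2>", re.DOTALL))  # []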
306
314
 
@@ -329,12 +337,12 @@ def retrieve_content_type(assistant_messages: List, metric: str) -> str:
329
337
  return "image"
330
338
 
331
339
  # Iterate through each message
332
- for item in assistant_messages:
340
+ for message in assistant_messages:
333
341
  # Ensure "content" exists in the message and is iterable
334
- content = item.get("content", [])
335
- for message in content:
336
- if message.get("type", "") == "image_url":
337
- return "image"
342
+ if isinstance(message.get("content", []), list):
343
+ for content in message.get("content", []):
344
+ if content.get("type") == "image_url":
345
+ return "image"
338
346
  # Default return if no image was found
339
347
  return "text"
340
348
 
@@ -362,6 +370,8 @@ def validate_conversation(conversation):
      )
      expected_roles = {"user", "assistant", "system"}
      image_found = False
+     assistant_message_count = 0
+     user_message_count = 0
      for num, message in enumerate(messages, 1):
          if not isinstance(message, dict):
              try:
@@ -374,17 +384,21 @@ def validate_conversation(conversation):
                  )
              except ImportError as ex:
                  raise MissingRequiredPackage(
-                     message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage"
+                     message="Please install 'azure-ai-inference' package to use SystemMessage, "
+                     "UserMessage or AssistantMessage."
                  ) from ex

-             if isinstance(messages[0], ChatRequestMessage) and not isinstance(
+             if isinstance(message, ChatRequestMessage) and not isinstance(
                  message, (UserMessage, AssistantMessage, SystemMessage)
              ):
                  raise_exception(
                      f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
                      ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                  )
-
+             if isinstance(message, AssistantMessage):
+                 assistant_message_count += 1
+             if isinstance(message, UserMessage):
+                 user_message_count += 1
              if isinstance(message.content, list) and any(
                  isinstance(item, ImageContentItem) for item in message.content
              ):
@@ -395,6 +409,10 @@ def validate_conversation(conversation):
                  f"Invalid role provided: {message.get('role')}. Message number: {num}",
                  ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
              )
+         if message.get("role") == "assistant":
+             assistant_message_count += 1
+         if message.get("role") == "user":
+             user_message_count += 1
          content = message.get("content")
          if not isinstance(content, (str, list)):
              raise_exception(
@@ -409,3 +427,19 @@ def validate_conversation(conversation):
              "Message needs to have multi-modal input like images.",
              ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
          )
+     if assistant_message_count == 0:
+         raise_exception(
+             "Assistant role required in one of the messages.",
+             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+         )
+     if user_message_count == 0:
+         raise_exception(
+             "User role required in one of the messages.",
+             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+         )
+     if assistant_message_count > 1:
+         raise_exception(
+             "Evaluators for multimodal conversations only support single turn. "
+             "User and assistant role expected as the only role in each message.",
+             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+         )
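Taken together, the new counters mean a multimodal conversation must contain at least one user message, exactly one assistant message, and at least one image. A minimal conversation dict that would pass these checks (the image URL is a placeholder):

    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},  # placeholder
                ],
            },
            {
                "role": "assistant",
                "content": "The picture shows a cat sitting on a windowsill.",
            },
        ]
    }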
@@ -54,6 +54,7 @@ class EvaluationRunProperties:

      RUN_TYPE = "runType"
      EVALUATION_RUN = "_azureml.evaluation_run"
+     EVALUATION_SDK = "_azureml.evaluation_sdk_name"


  DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
@@ -62,6 +63,7 @@ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4

  PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
  PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+ PF_DISABLE_TRACING = "PF_DISABLE_TRACING"

  OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
  OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
@@ -14,6 +14,7 @@ from azure.ai.evaluation._constants import (
      OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
      PF_BATCH_TIMEOUT_SEC,
      PF_BATCH_TIMEOUT_SEC_DEFAULT,
+     PF_DISABLE_TRACING,
  )

  from ..._user_agent import USER_AGENT
@@ -36,8 +37,12 @@ class EvalRunContext:
          self.client = client
          self._is_batch_timeout_set_by_system = False
          self._is_otel_timeout_set_by_system = False
+         self._original_cwd = os.getcwd()

      def __enter__(self) -> None:
+         # Preserve current working directory, as PF may change it without restoring it afterward
+         self._original_cwd = os.getcwd()
+
          if isinstance(self.client, CodeClient):
              ClientUserAgentUtil.append_user_agent(USER_AGENT)
              inject_openai_api()
@@ -45,6 +50,7 @@ class EvalRunContext:
          if isinstance(self.client, ProxyClient):
              os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
              os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+             os.environ[PF_DISABLE_TRACING] = "true"

              if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                  os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -64,12 +70,15 @@ class EvalRunContext:
          exc_value: Optional[BaseException],
          exc_tb: Optional[types.TracebackType],
      ) -> None:
+         os.chdir(self._original_cwd)
+
          if isinstance(self.client, CodeClient):
              recover_openai_api()

          if isinstance(self.client, ProxyClient):
              os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
              os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+             os.environ.pop(PF_DISABLE_TRACING, None)

          if self._is_batch_timeout_set_by_system:
              os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
@@ -68,12 +68,22 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
          run = proxy_run.run.result()

          # pylint: disable=protected-access
+         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+         failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+         # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+         if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+             status = "Completed with Errors"
+         else:
+             status = run.status
+
+         # Return the ordered dictionary with the updated status
          return OrderedDict(
              [
-                 ("status", run.status),
+                 ("status", status),
                  ("duration", str(run._end_time - run._created_on)),
-                 ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
-                 ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                 ("completed_lines", completed_lines),
+                 ("failed_lines", failed_lines),
                  ("log_path", str(run._output_path)),
              ]
          )
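ProxyClient's run summary now distinguishes fully successful runs from partially failed ones instead of labelling both "Completed". A standalone sketch of the same decision, including the "NA" sentinel used when promptflow reports no line-level metrics (summarize_status is a hypothetical helper, not SDK API):

    def summarize_status(run_status: str, failed_lines) -> str:
        # "NA" means promptflow reported no line-level system metrics for the run.
        if run_status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
            return "Completed with Errors"
        return run_status

    assert summarize_status("Completed", 0) == "Completed"
    assert summarize_status("Completed", 3) == "Completed with Errors"
    assert summarize_status("Completed", "NA") == "Completed"
    assert summarize_status("Failed", 2) == "Failed"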
@@ -6,6 +6,7 @@ import types
  from typing import Optional, Type

  from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+ from azure.ai.evaluation._constants import PF_DISABLE_TRACING


  class TargetRunContext:
@@ -15,21 +16,31 @@ class TargetRunContext:
      :type upload_snapshot: bool
      """

-     def __init__(self, upload_snapshot: bool) -> None:
+     def __init__(self, upload_snapshot: bool = False) -> None:
          self._upload_snapshot = upload_snapshot
+         self._original_cwd = os.getcwd()

      def __enter__(self) -> None:
+         # Preserve current working directory, as PF may change it without restoring it afterward
+         self._original_cwd = os.getcwd()
+
          # Address "[WinError 32] The process cannot access the file" error,
          # caused by conflicts when the venv and target function are in the same directory.
          # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
          if not self._upload_snapshot:
              os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"

+         os.environ[PF_DISABLE_TRACING] = "true"
+
      def __exit__(
          self,
          exc_type: Optional[Type[BaseException]],
          exc_value: Optional[BaseException],
          exc_tb: Optional[types.TracebackType],
      ) -> None:
+         os.chdir(self._original_cwd)
+
          if not self._upload_snapshot:
              os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+         os.environ.pop(PF_DISABLE_TRACING, None)
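Both EvalRunContext and TargetRunContext now follow the same save-and-restore discipline: record the working directory on entry, set the promptflow flags, and undo both on exit even if the body raises. A condensed, hypothetical version of that pattern (not SDK code) showing why the restore matters:

    import os
    import tempfile

    class PFEnvGuard:
        """Hypothetical, condensed version of the save/restore pattern above."""

        def __enter__(self):
            self._original_cwd = os.getcwd()            # PF may chdir without restoring
            os.environ["PF_DISABLE_TRACING"] = "true"   # suppress promptflow tracing
            return self

        def __exit__(self, exc_type, exc_value, exc_tb):
            os.chdir(self._original_cwd)                # always restore, even on error
            os.environ.pop("PF_DISABLE_TRACING", None)

    cwd_before = os.getcwd()
    with PFEnvGuard():
        # Stand-in for a promptflow batch run that changes the working directory.
        os.chdir(tempfile.gettempdir())
    assert os.getcwd() == cwd_before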
@@ -22,28 +22,12 @@ from azure.ai.evaluation._version import VERSION
  from azure.core.pipeline.policies import RetryPolicy
  from azure.core.rest import HttpResponse
  from azure.core.exceptions import HttpResponseError
+ from azure.storage.blob import BlobServiceClient
+ from azure.ai.evaluation._azure._clients import LiteMLClient

  LOGGER = logging.getLogger(__name__)


- # Handle optional import. The azure libraries are only present if
- # promptflow-azure is installed.
- try:
-     from azure.ai.ml import MLClient
-     from azure.ai.ml.entities._credentials import AccountKeyConfiguration # pylint: disable=ungrouped-imports
-     from azure.ai.ml.entities._datastore.datastore import Datastore
-     from azure.storage.blob import BlobServiceClient
- except (ModuleNotFoundError, ImportError):
-     # If the above mentioned modules cannot be imported, we are running
-     # in local mode and MLClient in the constructor will be None, so
-     # we will not arrive to Azure-dependent code.
-
-     # We are logging the import failure only if debug logging level is set because:
-     # - If the project configuration was not provided this import is not needed.
-     # - If the project configuration was provided, the error will be raised by PFClient.
-     LOGGER.debug("promptflow.azure is not installed.")
-
-
  @dataclasses.dataclass
  class RunInfo:
      """
@@ -92,18 +76,18 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
      :type group_name: str
      :param workspace_name: The name of workspace/project used to track run.
      :type workspace_name: str
-     :param ml_client: The ml client used for authentication into Azure.
-     :type ml_client: azure.ai.ml.MLClient
+     :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+     :type management_client:
+         ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
      :param promptflow_run: The promptflow run used by the
+     :type promptflow_run: Optional[promptflow._sdk.entities.Run]
      """

      _MAX_RETRIES = 5
      _BACKOFF_FACTOR = 2
      _TIMEOUT = 5
-     _SCOPE = "https://management.azure.com/.default"

      EVALUATION_ARTIFACT = "instance_results.jsonl"
-     EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"

      def __init__(
          self,
@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          subscription_id: str,
          group_name: str,
          workspace_name: str,
-         ml_client: "MLClient",
+         management_client: LiteMLClient,
          promptflow_run: Optional[Run] = None,
      ) -> None:
          self._tracking_uri: str = tracking_uri
          self._subscription_id: str = subscription_id
          self._resource_group_name: str = group_name
          self._workspace_name: str = workspace_name
-         self._ml_client: Any = ml_client
+         self._management_client: LiteMLClient = management_client
          self._is_promptflow_run: bool = promptflow_run is not None
          self._run_name = run_name
          self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          if self._promptflow_run is not None:
              self._info = RunInfo(
                  self._promptflow_run.name,
-                 self._promptflow_run._experiment_name, # pylint: disable=protected-access
+                 self._promptflow_run._experiment_name or "", # pylint: disable=protected-access
                  self._promptflow_run.name,
              )
          else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          """
          return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"

-     def _get_token(self):
-         # We have to use lazy import because promptflow.azure
-         # is an optional dependency.
-         from promptflow.azure._utils._token_cache import ArmTokenCache # pylint: disable=import-error,no-name-in-module
-
-         return ArmTokenCache().get_token(self._ml_client._credential) # pylint: disable=protected-access
+     def _get_token(self) -> str:
+         return self._management_client.get_token()

      def request_with_retry(
          self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -413,7 +393,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          """
          if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
              return
-         # Check if artifact dirrectory is empty or does not exist.
+         # Check if artifact directory is empty or does not exist.
          if not os.path.isdir(artifact_folder):
              LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
              return
@@ -441,9 +421,10 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                  local_paths.append(local_file_path)

          # We will write the artifacts to the workspaceblobstore
-         datastore = self._ml_client.datastores.get_default(include_secrets=True)
+         datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
          account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-         svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
+
+         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
          try:
              for local, remote in zip(local_paths, remote_paths["paths"]):
                  blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
@@ -469,6 +450,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          # we are rewriting already registered artifact and need to skip this step.
          if self._is_promptflow_run:
              return
+
          url = (
              f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
              f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -491,15 +473,28 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
          if response.status_code != 200:
              self._log_warning("register artifact", response)

-     def _get_datastore_credential(self, datastore: "Datastore"):
-         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
-         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
-         credential = datastore.credentials
-         if isinstance(credential, AccountKeyConfiguration):
-             return credential.account_key
-         if hasattr(credential, "sas_token"):
-             return credential.sas_token
-         return self._ml_client.datastores._credential # pylint: disable=protected-access
+         # register artifacts for images if exists in image folder
+         try:
+             for remote_path in remote_paths["paths"]:
+                 remote_file_path = remote_path["path"]
+                 if "images" in os.path.normpath(remote_file_path).split(os.sep):
+                     response = self.request_with_retry(
+                         url=url,
+                         method="POST",
+                         json_dict={
+                             "origin": "ExperimentRun",
+                             "container": f"dcid.{self.info.run_id}",
+                             "path": posixpath.join("images", os.path.basename(remote_file_path)),
+                             "dataPath": {
+                                 "dataStoreName": datastore.name,
+                                 "relativePath": remote_file_path,
+                             },
+                         },
+                     )
+                     if response.status_code != 200:
+                         self._log_warning("register image artifact", response)
+         except Exception as ex: # pylint: disable=broad-exception-caught
+             LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)

      def log_metric(self, key: str, value: float) -> None:
          """