azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
Files changed (83)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
  3. azure/ai/evaluation/_common/constants.py +5 -0
  4. azure/ai/evaluation/_common/math.py +73 -2
  5. azure/ai/evaluation/_common/rai_service.py +250 -62
  6. azure/ai/evaluation/_common/utils.py +196 -23
  7. azure/ai/evaluation/_constants.py +7 -6
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
  10. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
  11. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
  13. azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
  15. azure/ai/evaluation/_evaluate/_utils.py +46 -11
  16. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
  18. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  31. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
  32. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  34. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  35. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
  36. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  44. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
  45. azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
  46. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
  47. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  48. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
  49. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  50. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
  51. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  52. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  53. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
  54. azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
  55. azure/ai/evaluation/_exceptions.py +19 -0
  56. azure/ai/evaluation/_model_configurations.py +83 -15
  57. azure/ai/evaluation/_version.py +1 -1
  58. azure/ai/evaluation/simulator/__init__.py +2 -1
  59. azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
  60. azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
  61. azure/ai/evaluation/simulator/_constants.py +11 -1
  62. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  63. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  64. azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
  65. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  66. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  67. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
  68. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
  69. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  70. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  71. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  72. azure/ai/evaluation/simulator/_simulator.py +165 -105
  73. azure/ai/evaluation/simulator/_utils.py +31 -13
  74. azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
  75. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
  76. azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
  77. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
  78. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  79. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  80. azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
  81. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  82. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  83. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/utils.py

@@ -2,13 +2,14 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin

 import nltk
 from typing_extensions import NotRequired, Required, TypeGuard
-
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
@@ -24,7 +25,7 @@ _nltk_data_download_lock = threading.Lock()
 T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])


-def get_harm_severity_level(harm_score: int) -> Union[str, float]:
+def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
@@ -48,13 +49,19 @@ def get_harm_severity_level(harm_score: int) -> Union[str, float]:

 def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
+    nltk_data = [
+        ("wordnet", "corpora/wordnet.zip"),
+        ("perluniprops", "misc/perluniprops.zip"),
+        ("punkt", "tokenizers/punkt.zip"),
+        ("punkt_tab", "tokenizers/punkt_tab.zip"),
+    ]
+
     with _nltk_data_download_lock:
-        try:
-            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
-        except LookupError:
-            nltk.download("perluniprops")
-            nltk.download("punkt")
-            nltk.download("punkt_tab")
+        for _id, resource_name in nltk_data:
+            try:
+                nltk.find(resource_name)
+            except LookupError:
+                nltk.download(_id)


 def nltk_tokenize(text: str) -> List[str]:
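
The rewritten ensure_nltk_data_downloaded probes for each resource before fetching it, so repeated runs skip the network call. A minimal sketch of the same check-before-download pattern (the helper name is illustrative; nltk.find raises LookupError when a resource is absent):

    import nltk

    def ensure_resource(package_id: str, resource_path: str) -> None:
        # nltk.find succeeds silently when the resource is already on disk;
        # only on LookupError do we fetch the package.
        try:
            nltk.find(resource_path)   # e.g. "tokenizers/punkt.zip"
        except LookupError:
            nltk.download(package_id)  # e.g. "punkt"
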
@@ -122,24 +129,23 @@ def validate_azure_ai_project(o: object) -> AzureAIProject:
     fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}

     if not isinstance(o, dict):
-        msg = "azure_ai_project must be a dictionary"
+        msg = "The 'azure_ai_project' parameter must be a dictionary."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     missing_fields = set(fields.keys()) - o.keys()

     if missing_fields:
-        msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields)
+        msg = (
+            "The 'azure_ai_project' dictionary is missing the following required "
+            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

@@ -147,13 +153,10 @@ def validate_azure_ai_project(o: object) -> AzureAIProject:
         if isinstance(o[field_name], expected_type):
             continue

-        msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}."
-
+        msg = f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}."
         raise EvaluationException(
-            message=f"{msg}. Got {type(o[field_name])}.",
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

@@ -270,3 +273,173 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
             validate_annotation(v, annotations[k])

     return cast(T_TypedDict, o)
+
+
+def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+    """Parse the output of prompt-based quality evaluators that return a score and reason.
+
+    Current supported evaluators:
+        - Fluency
+        - Relevance
+        - Retrieval
+        - Groundedness
+        - Coherence
+
+    :param llm_output: The output of the prompt-based quality evaluator.
+    :type llm_output: str
+    :return: The score and reason.
+    :rtype: Tuple[float, str]
+    """
+    score = math.nan
+    reason = ""
+    if llm_output:
+        try:
+            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            reason_pattern = r"<S1>(.*?)</S1>"
+            score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+            reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+            if score_match:
+                score = float(score_match[0].strip())
+            if reason_match:
+                reason = reason_match[0].strip()
+        except ValueError as exc:
+            raise EvaluationException(
+                message=f"Failed to parse model output: \n{llm_output}",
+                internal_message="Failed to parse model output.",
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
+            ) from exc
+
+    return score, reason
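
The updated quality prompty files evidently have the model emit its reasoning inside <S1> tags and an integer score from 1 to 5 inside <S2> tags, which is what the two regexes extract. A hypothetical input/output pair:

    sample = "<S1>The response reads smoothly and stays on topic.</S1><S2>4</S2>"
    score, reason = parse_quality_evaluator_reason_score(sample)
    # score == 4.0
    # reason == "The response reads smoothly and stays on topic."
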
+
+
+def remove_optional_singletons(eval_class, singletons):
+    required_singletons = singletons.copy()
+    if hasattr(eval_class, "_OPTIONAL_PARAMS"):  # pylint: disable=protected-access
+        for param in eval_class._OPTIONAL_PARAMS:  # pylint: disable=protected-access
+            if param in singletons:
+                del required_singletons[param]
+    return required_singletons
+
+
+def retrieve_content_type(assistant_messages: List, metric: str) -> str:
+    """Get the content type for service payload.
+
+    :param assistant_messages: The list of messages to be annotated by evaluation service
+    :type assistant_messages: list
+    :param metric: A string representing the metric type
+    :type metric: str
+    :return: A text representing the content type. Example: 'text', or 'image'
+    :rtype: str
+    """
+    # Check if metric is "protected_material"
+    if metric == "protected_material":
+        return "image"
+
+    # Iterate through each message
+    for message in assistant_messages:
+        # Ensure "content" exists in the message and is iterable
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                if content.get("type") == "image_url":
+                    return "image"
+    # Default return if no image was found
+    return "text"
+
+
+def validate_conversation(conversation):
+    def raise_exception(msg, target):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=target,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    if not conversation or "messages" not in conversation:
+        raise_exception(
+            "Attribute 'messages' is missing in the request",
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
+        )
+    messages = conversation["messages"]
+    if not isinstance(messages, list):
+        raise_exception(
+            "'messages' parameter must be a JSON-compatible list of chat messages",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    expected_roles = {"user", "assistant", "system"}
+    image_found = False
+    assistant_message_count = 0
+    user_message_count = 0
+    for num, message in enumerate(messages, 1):
+        if not isinstance(message, dict):
+            try:
+                from azure.ai.inference.models import (
+                    ChatRequestMessage,
+                    UserMessage,
+                    AssistantMessage,
+                    SystemMessage,
+                    ImageContentItem,
+                )
+            except ImportError as ex:
+                raise MissingRequiredPackage(
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, "
+                    "UserMessage or AssistantMessage."
+                ) from ex
+
+            if isinstance(message, ChatRequestMessage) and not isinstance(
+                message, (UserMessage, AssistantMessage, SystemMessage)
+            ):
+                raise_exception(
+                    f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
+                    ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                )
+            if isinstance(message, AssistantMessage):
+                assistant_message_count += 1
+            if isinstance(message, UserMessage):
+                user_message_count += 1
+            if isinstance(message.content, list) and any(
+                isinstance(item, ImageContentItem) for item in message.content
+            ):
+                image_found = True
+            continue
+        if message.get("role") not in expected_roles:
+            raise_exception(
+                f"Invalid role provided: {message.get('role')}. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        if message.get("role") == "assistant":
+            assistant_message_count += 1
+        if message.get("role") == "user":
+            user_message_count += 1
+        content = message.get("content")
+        if not isinstance(content, (str, list)):
+            raise_exception(
+                f"Content in each turn must be a string or array. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        if isinstance(content, list):
+            if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                image_found = True
+    if not image_found:
+        raise_exception(
+            "Message needs to have multi-modal input like images.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if assistant_message_count == 0:
+        raise_exception(
+            "Assistant role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if user_message_count == 0:
+        raise_exception(
+            "User role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if assistant_message_count > 1:
+        raise_exception(
+            "Evaluators for multimodal conversations only support single turn. "
+            "User and assistant role expected as the only role in each message.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
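
For reference, a hypothetical single-turn multimodal conversation that satisfies all of these checks: exactly one user message carrying an image_url content item and one assistant reply.

    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                ],
            },
            {"role": "assistant", "content": "A cat sitting on a windowsill."},
        ]
    }
    validate_conversation(conversation)  # returns None; raises EvaluationException on invalid input
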
azure/ai/evaluation/_constants.py

@@ -7,13 +7,13 @@ from typing import Literal
 class EvaluationMetrics:
     """Metrics for model evaluation."""

-    GPT_GROUNDEDNESS = "gpt_groundedness"
-    GPT_RELEVANCE = "gpt_relevance"
-    GPT_COHERENCE = "gpt_coherence"
-    GPT_FLUENCY = "gpt_fluency"
-    GPT_SIMILARITY = "gpt_similarity"
+    GROUNDEDNESS = "groundedness"
+    RELEVANCE = "relevance"
+    COHERENCE = "coherence"
+    FLUENCY = "fluency"
+    SIMILARITY = "similarity"
     F1_SCORE = "f1_score"
-    GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
+    RETRIEVAL_SCORE = "retrieval_score"
     HATE_FAIRNESS = "hate_fairness"
     HATE_UNFAIRNESS = "hate_unfairness"
     VIOLENCE = "violence"
@@ -62,6 +62,7 @@ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4

 PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+PF_DISABLE_TRACING = "PF_DISABLE_TRACING"

 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
azure/ai/evaluation/_evaluate/_batch_run/__init__.py (renamed from _batch_run_client/__init__.py)

@@ -1,8 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from .batch_run_context import BatchRunContext
+from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
+from .target_run_context import TargetRunContext

-__all__ = ["CodeClient", "ProxyClient", "BatchRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
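
Internal imports move with the rename; a before/after sketch (these are private modules, so only code inside the package is affected):

    # 1.0.0b4
    from azure.ai.evaluation._evaluate._batch_run_client import BatchRunContext
    # 1.0.1
    from azure.ai.evaluation._evaluate._batch_run import EvalRunContext, TargetRunContext
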
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py (renamed from _batch_run_client/batch_run_context.py)

@@ -14,6 +14,7 @@ from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+    PF_DISABLE_TRACING,
 )

 from ..._user_agent import USER_AGENT
@@ -22,13 +23,13 @@ from .code_client import CodeClient
 from .proxy_client import ProxyClient


-class BatchRunContext:
-    """Context manager for batch run clients.
+class EvalRunContext:
+    """Context manager for eval batch run.

     :param client: The client to run in the context.
     :type client: Union[
-        ~azure.ai.evaluation._evaluate._batch_run_client.code_client.CodeClient,
-        ~azure.ai.evaluation._evaluate._batch_run_client.proxy_client.ProxyClient
+        ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
+        ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
     ]
     """

@@ -36,8 +37,12 @@ class BatchRunContext:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()

     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -45,6 +50,7 @@
         if isinstance(self.client, ProxyClient):
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+            os.environ[PF_DISABLE_TRACING] = "true"

             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -64,12 +70,15 @@
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()

         if isinstance(self.client, ProxyClient):
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+            os.environ.pop(PF_DISABLE_TRACING, None)

             if self._is_batch_timeout_set_by_system:
                 os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
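
Both EvalRunContext and the new TargetRunContext (below) now capture and restore the working directory, since promptflow may chdir without restoring it. Stripped to its essentials, the pattern is (the class name here is illustrative):

    import os

    class CwdPreserving:
        def __enter__(self):
            # Capture the working directory on entry ...
            self._original_cwd = os.getcwd()

        def __exit__(self, exc_type, exc_value, exc_tb):
            # ... and restore it on exit, whatever the wrapped run did.
            os.chdir(self._original_cwd)
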
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py (moved from _batch_run_client/)

@@ -1,13 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
+# pylint: disable=protected-access
+
 import inspect
 import logging
 import math
 import os
+from collections import OrderedDict
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
-from collections import OrderedDict

 import pandas as pd
 from promptflow.client import PFClient
@@ -37,7 +40,7 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         **kwargs
     ) -> ProxyRun:
         flow_to_run = flow
-        if hasattr(flow, "_to_async"):
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access

         batch_use_async = self._should_batch_use_async(flow_to_run)
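
The async conversion is now gated on the AI_EVALS_BATCH_USE_ASYNC environment variable (renamed from PF_EVALS_BATCH_USE_ASYNC, as the next hunk shows), so the synchronous path can be selected explicitly:

    import os
    os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"  # skip _to_async(); run the flow synchronously
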
@@ -65,19 +68,29 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         run = proxy_run.run.result()

         # pylint: disable=protected-access
+        completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+        failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+        if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+            status = "Completed with Errors"
+        else:
+            status = run.status
+
+        # Return the ordered dictionary with the updated status
         return OrderedDict(
             [
-                ("status", run.status),
+                ("status", status),
                 ("duration", str(run._end_time - run._created_on)),
-                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
-                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("completed_lines", completed_lines),
+                ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )

     @staticmethod
     def _should_batch_use_async(flow):
-        if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
             if inspect.iscoroutinefunction(flow):
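
The net effect of the status rewrite, in isolation (values are illustrative):

    failed_lines = "2"        # read from run._properties system metrics; "NA" when absent
    run_status = "Completed"  # status reported by promptflow
    status = (
        "Completed with Errors"
        if run_status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0
        else run_status
    )
    # status == "Completed with Errors"
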
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py (new file)

@@ -0,0 +1,46 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import types
+from typing import Optional, Type
+
+from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._constants import PF_DISABLE_TRACING
+
+
+class TargetRunContext:
+    """Context manager for target batch run.
+
+    :param upload_snapshot: Whether to upload target snapshot.
+    :type upload_snapshot: bool
+    """
+
+    def __init__(self, upload_snapshot: bool) -> None:
+        self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
+
+    def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
+        # Address "[WinError 32] The process cannot access the file" error,
+        # caused by conflicts when the venv and target function are in the same directory.
+        # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
+        if not self._upload_snapshot:
+            os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
+
+        os.environ[PF_DISABLE_TRACING] = "true"
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
+        os.chdir(self._original_cwd)
+
+        if not self._upload_snapshot:
+            os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+        os.environ.pop(PF_DISABLE_TRACING, None)
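
A hedged usage sketch of the new context manager (presumably how _evaluate wraps target runs; the body is elided):

    with TargetRunContext(upload_snapshot=False):
        # Tracing is disabled and only the flex entry file (flow.flex.yaml)
        # is uploaded; both environment toggles are removed on exit.
        ...
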
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -21,6 +21,7 @@ from azure.ai.evaluation._http_utils import get_http_client
 from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
+from azure.core.exceptions import HttpResponseError

 LOGGER = logging.getLogger(__name__)

@@ -33,14 +34,15 @@ try:
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
 except (ModuleNotFoundError, ImportError):
-    # If the above mentioned modules cannot be imported, we are running
-    # in local mode and MLClient in the constructor will be None, so
-    # we will not arrive to Azure-dependent code.
-
-    # We are logging the import failure only if debug logging level is set because:
-    # - If the project configuration was not provided this import is not needed.
-    # - If the project configuration was provided, the error will be raised by PFClient.
-    LOGGER.debug("promptflow.azure is not installed.")
+    raise EvaluationException(  # pylint: disable=raise-missing-from
+        message=(
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        ),
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.MISSING_PACKAGE,
+        blame=ErrorBlame.USER_ERROR,
+    )


 @dataclasses.dataclass
@@ -102,7 +104,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     _SCOPE = "https://management.azure.com/.default"

     EVALUATION_ARTIFACT = "instance_results.jsonl"
-    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"

     def __init__(
         self,
@@ -412,7 +413,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
             return
-        # Check if artifact dirrectory is empty or does not exist.
+        # Check if artifact directory is empty or does not exist.
         if not os.path.isdir(artifact_folder):
             LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
             return
@@ -443,15 +444,32 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         datastore = self._ml_client.datastores.get_default(include_secrets=True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
         svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
-        for local, remote in zip(local_paths, remote_paths["paths"]):
-            blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
-            with open(local, "rb") as fp:
-                blob_client.upload_blob(fp, overwrite=True)
+        try:
+            for local, remote in zip(local_paths, remote_paths["paths"]):
+                blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
+                with open(local, "rb") as fp:
+                    blob_client.upload_blob(fp, overwrite=True)
+        except HttpResponseError as ex:
+            if ex.status_code == 403:
+                msg = (
+                    "Failed to upload evaluation run to the cloud due to insufficient permission to access the storage."
+                    " Please ensure that the necessary access rights are granted."
+                )
+                raise EvaluationException(
+                    message=msg,
+                    target=ErrorTarget.EVAL_RUN,
+                    category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                    blame=ErrorBlame.USER_ERROR,
+                    tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+                ) from ex
+
+            raise ex

         # To show artifact in UI we will need to register it. If it is a promptflow run,
         # we are rewriting already registered artifact and need to skip this step.
         if self._is_promptflow_run:
             return
+
         url = (
             f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
             f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -474,6 +492,29 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if response.status_code != 200:
             self._log_warning("register artifact", response)

+        # register artifacts for images if exists in image folder
+        try:
+            for remote_path in remote_paths["paths"]:
+                remote_file_path = remote_path["path"]
+                if "images" in os.path.normpath(remote_file_path).split(os.sep):
+                    response = self.request_with_retry(
+                        url=url,
+                        method="POST",
+                        json_dict={
+                            "origin": "ExperimentRun",
+                            "container": f"dcid.{self.info.run_id}",
+                            "path": posixpath.join("images", os.path.basename(remote_file_path)),
+                            "dataPath": {
+                                "dataStoreName": datastore.name,
+                                "relativePath": remote_file_path,
+                            },
+                        },
+                    )
+                    if response.status_code != 200:
+                        self._log_warning("register image artifact", response)
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
+
     def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
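
The "images" membership test above operates on normalized path components, e.g.:

    import os
    os.path.normpath("images/0001.png").split(os.sep)  # ['images', '0001.png'] on POSIX
    # Any blob whose relative path contains an "images" directory is registered as an image artifact.
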