azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py
@@ -4,17 +4,13 @@
  import inspect
  import json
  import logging
- import os
- from concurrent.futures import Future
- from pathlib import Path
- from typing import Any, Callable, Dict, Optional, Union, cast

  import pandas as pd
- from promptflow.contracts.types import AttrDict
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+ from promptflow.contracts.types import AttrDict
  from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

  from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT

@@ -22,43 +18,35 @@ LOGGER = logging.getLogger(__name__)


  class CodeRun:
- def __init__(
- self,
- *,
- run: Future,
- input_data,
- evaluator_name: Optional[str] = None,
- aggregator: Callable[["CodeRun"], Future],
- **kwargs, # pylint: disable=unused-argument
- ) -> None:
+ def __init__(self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs):
  self.run = run
  self.evaluator_name = evaluator_name if evaluator_name is not None else ""
  self.input_data = input_data
- self.aggregated_metrics = aggregator(self)
+ self.aggregated_metrics = aggregated_metrics

- def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
+ def get_result_df(self, exclude_inputs=False):
  batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
- result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
+ result_df = self.run.result(timeout=batch_run_timeout)
  if exclude_inputs:
  result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
  return result_df

- def get_aggregated_metrics(self) -> Dict[str, Any]:
+ def get_aggregated_metrics(self):
  try:
  batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
- aggregated_metrics: Optional[Any] = (
- cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
+ aggregated_metrics = (
+ self.aggregated_metrics.result(timeout=batch_run_timeout)
  if self.aggregated_metrics is not None
  else None
  )
  except Exception as ex: # pylint: disable=broad-exception-caught
- LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
+ LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}")
  aggregated_metrics = None

  if not isinstance(aggregated_metrics, dict):
  LOGGER.warning(
- "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
- self.evaluator_name,
+ f"Aggregated metrics for evaluator {self.evaluator_name}"
+ f" is not a dictionary will not be logged as metrics"
  )

  aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -66,15 +54,11 @@ class CodeRun:
  return aggregated_metrics


- class CodeClient: # pylint: disable=client-accepts-api-version-keyword
- def __init__( # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
- self,
- ) -> None:
+ class CodeClient:
+ def __init__(self):
  self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

- def _calculate_metric(
- self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
- ) -> pd.DataFrame:
+ def _calculate_metric(self, evaluator, input_df, column_mapping, evaluator_name):
  row_metric_futures = []
  row_metric_results = []
  input_df = _apply_column_mapping(input_df, column_mapping)
@@ -111,10 +95,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
  verify_integrity=True,
  )

- @staticmethod
- def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
+ def _calculate_aggregations(self, evaluator, run):
  try:
  if _has_aggregator(evaluator):
+ aggregate_input = None
  evaluator_output = run.get_result_df(exclude_inputs=True)
  if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
  aggregate_input = evaluator_output["output"].tolist()
@@ -126,25 +110,18 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
  return aggregated_output
  except Exception as ex: # pylint: disable=broad-exception-caught
  LOGGER.warning(
- "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
+ f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}"
  )
  return None

- def run(
- self, # pylint: disable=unused-argument
- flow: Callable,
- data: Union[os.PathLike, Path, pd.DataFrame],
- evaluator_name: Optional[str] = None,
- column_mapping: Optional[Dict[str, str]] = None,
- **kwargs,
- ) -> CodeRun:
+ def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs):
  input_df = data
  if not isinstance(input_df, pd.DataFrame):
  try:
  json_data = load_jsonl(data)
  except json.JSONDecodeError as exc:
  raise EvaluationException(
- message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+ message = f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
  internal_message="Failed to parse data as JSON",
  target=ErrorTarget.CODE_CLIENT,
  category=ErrorCategory.INVALID_VALUE,
@@ -152,37 +129,22 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
  ) from exc

  input_df = pd.DataFrame(json_data)
- eval_future = self._thread_pool.submit(
- self._calculate_metric,
- evaluator=flow,
- input_df=input_df,
- column_mapping=column_mapping,
- evaluator_name=evaluator_name,
- )
+ eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name)
+ run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
+ aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
+ run.aggregated_metrics = aggregation_future
+ return run

- return CodeRun(
- run=eval_future,
- input_data=data,
- evaluator_name=evaluator_name,
- aggregator=lambda code_run: self._thread_pool.submit(
- self._calculate_aggregations, evaluator=flow, run=code_run
- ),
- )
-
- def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
+ def get_details(self, run, all_results=False):
  result_df = run.get_result_df(exclude_inputs=not all_results)
  return result_df

- def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
+ def get_metrics(self, run):
  try:
  aggregated_metrics = run.get_aggregated_metrics()
  print("Aggregated metrics")
  print(aggregated_metrics)
  except Exception as ex: # pylint: disable=broad-exception-caught
- LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
- return {}
+ LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}")
+ return None
  return aggregated_metrics
-
- def get_run_summary(self, run: CodeRun) -> Any: # pylint: disable=unused-argument
- # Not implemented
- return None
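
Reviewer note: the behavioral change in this file is how CodeRun obtains its aggregated-metrics future. In 1.0.0 an aggregator callable is injected at construction time and resolved inside __init__, while in 1.0.0b1 the run is built first and the aggregation future is assigned afterwards. A minimal standalone sketch of the 1.0.0 pattern follows; DemoRun and the lambda bodies are illustrative stand-ins, not the SDK's code.

# Minimal sketch of constructor-injected aggregation (1.0.0 style), not the SDK's code.
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable


class DemoRun:
    def __init__(self, *, run: Future, aggregator: Callable[["DemoRun"], Future]) -> None:
        self.run = run
        # The run is never observable without its aggregation future attached.
        self.aggregated_metrics = aggregator(self)


pool = ThreadPoolExecutor(max_workers=2)
row_future = pool.submit(lambda: [1, 2, 3])  # stands in for _calculate_metric
demo = DemoRun(
    run=row_future,
    aggregator=lambda r: pool.submit(lambda: {"mean": sum(r.run.result()) / 3}),
)
print(demo.aggregated_metrics.result())  # {'mean': 2.0}

The 1.0.0b1 version instead constructs CodeRun with aggregated_metrics=None and patches the future on after submitting _calculate_aggregations, which leaves a brief window where the attribute is None.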
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
@@ -0,0 +1,61 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import logging
+ import os
+
+ import numpy as np
+
+ from promptflow.client import PFClient
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ProxyRun:
+ def __init__(self, run, **kwargs):
+ self.run = run
+
+
+ class ProxyClient:
+ def __init__(self, pf_client: PFClient):
+ self._pf_client = pf_client
+ self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+ def run(self, flow, data, column_mapping=None, **kwargs):
+ flow_to_run = flow
+ if hasattr(flow, "_to_async"):
+ flow_to_run = flow._to_async()
+
+ batch_use_async = self._should_batch_use_async(flow_to_run)
+ eval_future = self._thread_pool.submit(
+ self._pf_client.run,
+ flow_to_run,
+ data=data,
+ column_mapping=column_mapping,
+ batch_use_async=batch_use_async,
+ **kwargs
+ )
+ return ProxyRun(run=eval_future)
+
+ def get_details(self, proxy_run, all_results=False):
+ run = proxy_run.run.result()
+ result_df = self._pf_client.get_details(run, all_results=all_results)
+ result_df.replace("(Failed)", np.nan, inplace=True)
+ return result_df
+
+ def get_metrics(self, proxy_run):
+ run = proxy_run.run.result()
+ return self._pf_client.get_metrics(run)
+
+ @staticmethod
+ def _should_batch_use_async(flow):
+ if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+ if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+ return True
+ elif inspect.iscoroutinefunction(flow):
+ return True
+ else:
+ return False
+ return False
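
Reviewer note: the new ProxyClient only asks promptflow's batch engine to run asynchronously when PF_EVALS_BATCH_USE_ASYNC is unset or "true" and the flow (or its __call__) is a coroutine function. The sketch below re-states that decision logic for illustration; demo_should_use_async and the evaluator functions are hypothetical names, not part of the package.

# Standalone illustration of the async gate used by _should_batch_use_async above.
import asyncio
import inspect
import os


def demo_should_use_async(flow) -> bool:
    # The env var defaults to "true", so async execution is opt-out rather than opt-in.
    if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() != "true":
        return False
    # Either a coroutine function itself, or a callable object with an async __call__.
    if inspect.iscoroutinefunction(flow):
        return True
    return hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__)


async def async_evaluator(query: str) -> dict:
    await asyncio.sleep(0)
    return {"score": 1}


def sync_evaluator(query: str) -> dict:
    return {"score": 1}


print(demo_should_use_async(async_evaluator))  # True
print(demo_should_use_async(sync_evaluator))   # False
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false"
print(demo_should_use_async(async_evaluator))  # False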
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -8,20 +8,17 @@ import logging
  import os
  import posixpath
  import time
- import types
  import uuid
- from typing import Any, Dict, List, Optional, Set, Type
+ from typing import Any, Dict, Optional, Set
  from urllib.parse import urlparse

- from promptflow._sdk.entities import Run
- from typing_extensions import Self
+ from azure.core.pipeline.policies import RetryPolicy
+ from azure.core.rest import HttpResponse

- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from promptflow._sdk.entities import Run
  from azure.ai.evaluation._http_utils import get_http_client
  from azure.ai.evaluation._version import VERSION
- from azure.core.pipeline.policies import RetryPolicy
- from azure.core.rest import HttpResponse
- from azure.core.exceptions import HttpResponseError
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

  LOGGER = logging.getLogger(__name__)

@@ -29,20 +26,18 @@ LOGGER = logging.getLogger(__name__)
  # Handle optional import. The azure libraries are only present if
  # promptflow-azure is installed.
  try:
- from azure.ai.ml import MLClient
  from azure.ai.ml.entities._credentials import AccountKeyConfiguration # pylint: disable=ungrouped-imports
  from azure.ai.ml.entities._datastore.datastore import Datastore
  from azure.storage.blob import BlobServiceClient
  except (ModuleNotFoundError, ImportError):
- raise EvaluationException( # pylint: disable=raise-missing-from
- message=(
- "The required packages for remote tracking are missing.\n"
- 'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
- ),
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.MISSING_PACKAGE,
- blame=ErrorBlame.USER_ERROR,
- )
+ # If the above mentioned modules cannot be imported, we are running
+ # in local mode and MLClient in the constructor will be None, so
+ # we will not arrive to Azure-dependent code.
+
+ # We are logging the import failure only if debug logging level is set because:
+ # - If the project configuration was not provided this import is not needed.
+ # - If the project configuration was provided, the error will be raised by PFClient.
+ LOGGER.debug("promptflow.azure is not installed.")


  @dataclasses.dataclass
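
Reviewer note: the two versions treat a missing promptflow-azure install differently. 1.0.0 fails fast with an EvaluationException that points at "pip install azure-ai-evaluation[remote]", while 1.0.0b1 only logs at debug level and defers the failure to later code paths. A generic sketch of both styles of optional-dependency guard, with a made-up RemoteTrackingUnavailable exception rather than the SDK's exception type:

# Generic optional-dependency guard in the spirit of the code above; names are illustrative.
import logging

LOGGER = logging.getLogger(__name__)


class RemoteTrackingUnavailable(ImportError):
    """Hypothetical error raised when remote-tracking extras are not installed."""


try:
    from azure.storage.blob import BlobServiceClient  # noqa: F401
    _HAS_REMOTE_EXTRAS = True
except ImportError:
    _HAS_REMOTE_EXTRAS = False
    # 1.0.0b1 style: remember the failure quietly and keep going in local mode.
    LOGGER.debug("Optional remote-tracking dependencies are not installed.")


def require_remote_extras() -> None:
    # 1.0.0 style: fail fast with an actionable message when remote tracking is requested.
    if not _HAS_REMOTE_EXTRAS:
        raise RemoteTrackingUnavailable(
            "The required packages for remote tracking are missing. "
            'Install them with "pip install azure-ai-evaluation[remote]".'
        )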
@@ -104,6 +99,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  _SCOPE = "https://management.azure.com/.default"

  EVALUATION_ARTIFACT = "instance_results.jsonl"
+ EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"

  def __init__(
  self,
@@ -124,8 +120,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  self._run_name = run_name
  self._promptflow_run = promptflow_run
  self._status = RunStatus.NOT_STARTED
- self._url_base: Optional[str] = None
- self._info: Optional[RunInfo] = None
+ self._url_base = None
+ self.info = None

  @property
  def status(self) -> RunStatus:
@@ -137,20 +133,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  """
  return self._status

- @property
- def info(self) -> RunInfo:
- if self._info is None:
- msg = "Run info is missing"
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.EVAL_RUN,
- category=ErrorCategory.UNKNOWN,
- blame=ErrorBlame.UNKNOWN,
- )
-
- return self._info
-
  def _get_scope(self) -> str:
  """
  Return the scope information for the workspace.
@@ -178,14 +160,12 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  )
  self._url_base = None
  self._status = RunStatus.BROKEN
- self._info = RunInfo.generate(self._run_name)
+ self.info = RunInfo.generate(self._run_name)
  else:
  self._url_base = urlparse(self._tracking_uri).netloc
  if self._promptflow_run is not None:
- self._info = RunInfo(
- self._promptflow_run.name,
- self._promptflow_run._experiment_name, # pylint: disable=protected-access
- self._promptflow_run.name,
+ self.info = RunInfo(
+ self._promptflow_run.name, self._promptflow_run._experiment_name, self._promptflow_run.name
  )
  else:
  url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
@@ -199,17 +179,15 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  body["run_name"] = self._run_name
  response = self.request_with_retry(url=url, method="POST", json_dict=body)
  if response.status_code != 200:
- self._info = RunInfo.generate(self._run_name)
+ self.info = RunInfo.generate(self._run_name)
  LOGGER.warning(
- "The run failed to start: %s: %s."
- "The results will be saved locally, but will not be logged to Azure.",
- response.status_code,
- response.text(),
+ f"The run failed to start: {response.status_code}: {response.text()}."
+ "The results will be saved locally, but will not be logged to Azure."
  )
  self._status = RunStatus.BROKEN
  else:
  parsed_response = response.json()
- self._info = RunInfo(
+ self.info = RunInfo(
  run_id=parsed_response["run"]["info"]["run_id"],
  experiment_id=parsed_response["run"]["info"]["experiment_id"],
  run_name=parsed_response["run"]["info"]["run_name"],
@@ -238,7 +216,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  internal_message="Incorrect terminal status. Valid statuses are 'FINISHED', 'FAILED' and 'KILLED'",
  target=ErrorTarget.EVAL_RUN,
  category=ErrorCategory.FAILED_EXECUTION,
- blame=ErrorBlame.UNKNOWN,
+ blame=ErrorBlame.UNKNOWN
  )
  url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/update"
  body = {
@@ -252,7 +230,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  LOGGER.warning("Unable to terminate the run.")
  self._status = RunStatus.TERMINATED

- def __enter__(self) -> Self:
+ def __enter__(self):
  """The Context Manager enter call.

  :return: The instance of the class.
@@ -261,21 +239,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  self._start_run()
  return self

- def __exit__(
- self,
- exc_type: Optional[Type[BaseException]],
- exc_value: Optional[BaseException],
- exc_tb: Optional[types.TracebackType],
- ) -> None:
- """The context manager exit call.
-
- :param exc_type: The exception type
- :type exc_type: Optional[Type[BaseException]]
- :param exc_value: The exception value
- :type exc_value: Optional[BaseException]
- :param exc_tb: The exception traceback
- :type exc_tb: Optional[types.TracebackType]
- """
+ def __exit__(self, exc_type, exc_value, exc_tb):
+ """The context manager exit call."""
  self._end_run("FINISHED")

  def get_run_history_uri(self) -> str:
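
Reviewer note: in both versions EvalRun is a context manager whose __enter__ starts the run and whose __exit__ always terminates it with the "FINISHED" status; 1.0.0b1 drops the type annotations and the detailed __exit__ docstring but not the behavior. A minimal standalone sketch of that lifecycle, using logging stand-ins instead of the real MLflow REST calls:

# Standalone illustration of the enter/exit lifecycle shown above; not the SDK's class.
import contextlib
import enum
import logging

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("demo_eval_run")


class RunStatus(enum.Enum):
    NOT_STARTED = 0
    STARTED = 1
    TERMINATED = 2


class DemoEvalRun(contextlib.AbstractContextManager):
    def __init__(self, run_name: str) -> None:
        self._run_name = run_name
        self._status = RunStatus.NOT_STARTED

    def _start_run(self) -> None:
        LOGGER.info("starting run %s", self._run_name)  # real code POSTs to mlflow .../runs/create
        self._status = RunStatus.STARTED

    def _end_run(self, reason: str) -> None:
        LOGGER.info("ending run %s with %s", self._run_name, reason)  # real code POSTs .../runs/update
        self._status = RunStatus.TERMINATED

    def __enter__(self):
        self._start_run()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Mirrors the SDK: the run is always marked FINISHED, even if the body raised.
        self._end_run("FINISHED")


with DemoEvalRun("my-eval") as run:
    pass  # log metrics and artifacts here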
@@ -315,7 +280,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  # is an optional dependency.
  from promptflow.azure._utils._token_cache import ArmTokenCache # pylint: disable=import-error,no-name-in-module

- return ArmTokenCache().get_token(self._ml_client._credential) # pylint: disable=protected-access
+ return ArmTokenCache().get_token(self._ml_client._credential)

  def request_with_retry(
  self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -361,10 +326,9 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  :type response: HttpResponse
  """
  LOGGER.warning(
- "Unable to %s, the request failed with status code %s, response.text()=%s.",
- failed_op,
- response.status_code,
- response.text(),
+ f"Unable to {failed_op}, "
+ f"the request failed with status code {response.status_code}, "
+ f"{response.text()=}."
  )

  def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool:
@@ -378,8 +342,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  :type bad_states: Set[RunStatus]
  :param should_raise: Should we raise an error if the bad state has been encountered
  :type should_raise: bool
- :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
- and invalid state was encountered.
+ :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True and invalid state was encountered.
  :return: Whether or not run is in the correct state.
  :rtype: bool
  """
@@ -391,7 +354,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  internal_message=msg,
  target=ErrorTarget.EVAL_RUN,
  category=ErrorCategory.FAILED_EXECUTION,
- blame=ErrorBlame.UNKNOWN,
+ blame=ErrorBlame.UNKNOWN
  )
  LOGGER.warning(msg)
  return False
@@ -413,7 +376,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  """
  if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
  return
- # Check if artifact directory is empty or does not exist.
+ # Check if artifact dirrectory is empty or does not exist.
  if not os.path.isdir(artifact_folder):
  LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
  return
@@ -425,7 +388,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  return
  # First we will list the files and the appropriate remote paths for them.
  root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
- remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
+ remote_paths = {"paths": []}
  local_paths = []
  # Go over the artifact folder and upload all artifacts.
  for root, _, filenames in os.walk(artifact_folder):
@@ -444,32 +407,15 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  datastore = self._ml_client.datastores.get_default(include_secrets=True)
  account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
  svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
- try:
- for local, remote in zip(local_paths, remote_paths["paths"]):
- blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
- with open(local, "rb") as fp:
- blob_client.upload_blob(fp, overwrite=True)
- except HttpResponseError as ex:
- if ex.status_code == 403:
- msg = (
- "Failed to upload evaluation run to the cloud due to insufficient permission to access the storage."
- " Please ensure that the necessary access rights are granted."
- )
- raise EvaluationException(
- message=msg,
- target=ErrorTarget.EVAL_RUN,
- category=ErrorCategory.FAILED_REMOTE_TRACKING,
- blame=ErrorBlame.USER_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
- ) from ex
-
- raise ex
+ for local, remote in zip(local_paths, remote_paths["paths"]):
+ blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
+ with open(local, "rb") as fp:
+ blob_client.upload_blob(fp, overwrite=True)

  # To show artifact in UI we will need to register it. If it is a promptflow run,
  # we are rewriting already registered artifact and need to skip this step.
  if self._is_promptflow_run:
  return
-
  url = (
  f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
  f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -492,29 +438,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  if response.status_code != 200:
  self._log_warning("register artifact", response)

- # register artifacts for images if exists in image folder
- try:
- for remote_path in remote_paths["paths"]:
- remote_file_path = remote_path["path"]
- if "images" in os.path.normpath(remote_file_path).split(os.sep):
- response = self.request_with_retry(
- url=url,
- method="POST",
- json_dict={
- "origin": "ExperimentRun",
- "container": f"dcid.{self.info.run_id}",
- "path": posixpath.join("images", os.path.basename(remote_file_path)),
- "dataPath": {
- "dataStoreName": datastore.name,
- "relativePath": remote_file_path,
- },
- },
- )
- if response.status_code != 200:
- self._log_warning("register image artifact", response)
- except Exception as ex: # pylint: disable=broad-exception-caught
- LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
-
  def _get_datastore_credential(self, datastore: "Datastore"):
  # Reference the logic in azure.ai.ml._artifact._artifact_utilities
  # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
@@ -523,7 +446,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  return credential.account_key
  if hasattr(credential, "sas_token"):
  return credential.sas_token
- return self._ml_client.datastores._credential # pylint: disable=protected-access
+ return self._ml_client.datastores._credential

  def log_metric(self, key: str, value: float) -> None:
  """