azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been marked as a potentially problematic release.

Files changed (58)
  1. azure/ai/evaluation/__init__.py +1 -5
  2. azure/ai/evaluation/_common/rai_service.py +4 -4
  3. azure/ai/evaluation/_common/utils.py +19 -19
  4. azure/ai/evaluation/_constants.py +9 -0
  5. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
  6. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
  7. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
  8. azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
  9. azure/ai/evaluation/_evaluate/_evaluate.py +35 -28
  10. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
  11. azure/ai/evaluation/_evaluate/_utils.py +29 -22
  12. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  13. azure/ai/evaluation/_evaluators/_chat/_chat.py +16 -9
  14. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +4 -10
  15. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -10
  16. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
  17. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +1 -2
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
  19. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +1 -1
  20. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +1 -1
  21. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +1 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +1 -1
  23. azure/ai/evaluation/_evaluators/_eci/_eci.py +2 -2
  24. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
  25. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +5 -10
  26. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +5 -10
  28. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
  29. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +2 -2
  30. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +2 -2
  31. azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
  32. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +5 -10
  33. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
  34. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -10
  35. azure/ai/evaluation/_evaluators/_xpia/xpia.py +1 -2
  36. azure/ai/evaluation/_http_utils.py +3 -3
  37. azure/ai/evaluation/_version.py +1 -1
  38. azure/ai/evaluation/simulator/__init__.py +1 -1
  39. azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
  40. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  41. azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
  42. azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
  43. azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
  44. azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
  45. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
  46. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
  47. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
  48. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
  49. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  50. azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
  51. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  52. azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +147 -80
  53. azure/ai/evaluation/simulator/_tracing.py +21 -24
  54. azure/ai/evaluation/simulator/_utils.py +4 -1
  55. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/METADATA +86 -14
  56. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/RECORD +58 -56
  57. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +0 -0
  58. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/__init__.py

@@ -25,11 +25,7 @@ from ._evaluators._relevance import RelevanceEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import (
-    AzureAIProject,
-    AzureOpenAIModelConfiguration,
-    OpenAIModelConfiguration,
-)
+from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
 __all__ = [
     "evaluate",

azure/ai/evaluation/_common/rai_service.py

@@ -11,12 +11,12 @@ from urllib.parse import urlparse
 
 import jwt
 import numpy as np
-from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
 
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential
 
 from .constants import (
     CommonConstants,
@@ -348,7 +348,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     )
 
     if response.status_code != 200:
-        msg = f"Failed to retrieve the discovery service URL."
+        msg = "Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
             internal_message=msg,

azure/ai/evaluation/_common/utils.py

@@ -2,20 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Optional, Union
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+import threading
+from typing import List, Optional, Union
 
-try:
-    from . import constants
-except ImportError:
-    import constants
+import nltk
+import numpy as np
 
-from typing import List
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
-import threading
-import numpy as np
-import nltk
+from . import constants
 
 _nltk_data_download_lock = threading.Lock()
 
@@ -46,7 +41,7 @@ def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
     with _nltk_data_download_lock:
         try:
-            from nltk.tokenize.nist import NISTTokenizer
+            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
         except LookupError:
             nltk.download("perluniprops")
             nltk.download("punkt")
@@ -54,12 +49,19 @@
 
 
 def nltk_tokenize(text: str) -> List[str]:
-    """Tokenize the input text using the NLTK tokenizer."""
+    """Tokenize the input text using the NLTK tokenizer.
+
+    :param text: The text to tokenize
+    :type text: str
+    :return: A list of tokens
+    :rtype: list[str]
+    """
     ensure_nltk_data_downloaded()
 
     if not text.isascii():
         # Use NISTTokenizer for international tokenization
         from nltk.tokenize.nist import NISTTokenizer
+
         tokens = NISTTokenizer().international_tokenize(text)
     else:
         # By default, use NLTK word tokenizer
@@ -68,20 +70,18 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def check_and_add_api_version_for_aoai_model_config(
+def ensure_api_version_in_aoai_model_config(
    model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     default_api_version: str,
 ) -> None:
-    if (
-        "azure_endpoint" in model_config or "azure_deployment" in model_config
-    ):
+    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
         model_config["api_version"] = model_config.get("api_version", default_api_version)
 
 
-def check_and_add_user_agent_for_aoai_model_config(
+def ensure_user_agent_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     prompty_model_config: dict,
     user_agent: Optional[str] = None,
 ) -> None:
     if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
-        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
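
The renamed helpers keep their mutate-in-place behavior. A minimal caller sketch with placeholder values (the helper lives in the internal azure/ai/evaluation/_common/utils.py module shown above):

    from azure.ai.evaluation._common.utils import ensure_api_version_in_aoai_model_config

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<deployment-name>",
    }
    # Fills in api_version only for Azure OpenAI-style configs, and only when the
    # caller has not already supplied one.
    ensure_api_version_in_aoai_model_config(model_config, default_api_version="<api-version>")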

azure/ai/evaluation/_constants.py

@@ -39,6 +39,15 @@ class Prefixes:
     TSG_OUTPUTS = "__outputs."
 
 
+class DefaultOpenEncoding:
+    """Enum that captures SDK's default values for the encoding param of open(...)"""
+
+    READ = "utf-8-sig"
+    """SDK Default Encoding when reading a file"""
+    WRITE = "utf-8"
+    """SDK Default Encoding when writing a file"""
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
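
The new DefaultOpenEncoding constants centralize the encodings the SDK uses when opening files. A minimal usage sketch (file names are placeholders):

    from azure.ai.evaluation._constants import DefaultOpenEncoding

    # Read with utf-8-sig so a leading UTF-8 BOM, if present, is stripped; write plain utf-8.
    with open("input.jsonl", encoding=DefaultOpenEncoding.READ) as src:
        rows = src.readlines()
    with open("output.jsonl", "w", encoding=DefaultOpenEncoding.WRITE) as dst:
        dst.writelines(rows)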

azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py

@@ -5,13 +5,14 @@ import os
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
 )
-from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
 
 from ..._user_agent import USER_AGENT
 from .._utils import set_event_loop_policy

azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py

@@ -4,13 +4,16 @@
 import inspect
 import json
 import logging
+import os
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union
 
 import pandas as pd
-
 from promptflow.contracts.types import AttrDict
-from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
 
@@ -18,7 +21,9 @@ LOGGER = logging.getLogger(__name__)
 
 
 class CodeRun:
-    def __init__(self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs):
+    def __init__(
+        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
+    ):
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
@@ -40,13 +45,13 @@
                 else None
             )
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}")
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
            aggregated_metrics = None
 
         if not isinstance(aggregated_metrics, dict):
             LOGGER.warning(
-                f"Aggregated metrics for evaluator {self.evaluator_name}"
-                f" is not a dictionary will not be logged as metrics"
+                "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
+                self.evaluator_name,
             )
 
         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -54,11 +59,15 @@
         return aggregated_metrics
 
 
-class CodeClient:
-    def __init__(self):
+class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self,
+    ) -> None:
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def _calculate_metric(self, evaluator, input_df, column_mapping, evaluator_name):
+    def _calculate_metric(
+        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
+    ) -> pd.DataFrame:
         row_metric_futures = []
         row_metric_results = []
         input_df = _apply_column_mapping(input_df, column_mapping)
@@ -110,18 +119,25 @@ class CodeClient:
             return aggregated_output
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.warning(
-                f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}"
+                "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
             )
         return None
 
-    def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs):
+    def run(
+        self,  # pylint: disable=unused-argument
+        flow: Callable,
+        data: Union[os.PathLike, Path, pd.DataFrame],
+        evaluator_name: Optional[str] = None,
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs,
+    ) -> CodeRun:
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
             try:
                 json_data = load_jsonl(data)
             except json.JSONDecodeError as exc:
                 raise EvaluationException(
-                    message = f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                     internal_message="Failed to parse data as JSON",
                     target=ErrorTarget.CODE_CLIENT,
                     category=ErrorCategory.INVALID_VALUE,
@@ -129,22 +145,28 @@
                 ) from exc
 
             input_df = pd.DataFrame(json_data)
-        eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name)
+        eval_future = self._thread_pool.submit(
+            self._calculate_metric,
+            evaluator=flow,
+            input_df=input_df,
+            column_mapping=column_mapping,
+            evaluator_name=evaluator_name,
+        )
         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
         run.aggregated_metrics = aggregation_future
         return run
 
-    def get_details(self, run, all_results=False):
+    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run):
+    def get_metrics(self, run: CodeRun) -> Optional[None]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}")
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return None
         return aggregated_metrics

azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

@@ -4,29 +4,40 @@
 import inspect
 import logging
 import os
+from concurrent.futures import Future
+from typing import Any, Callable, Dict, Optional, Union
 
 import numpy as np
-
+import pandas as pd
 from promptflow.client import PFClient
+from promptflow.entities import Run
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
 LOGGER = logging.getLogger(__name__)
 
 
 class ProxyRun:
-    def __init__(self, run, **kwargs):
+    def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
         self.run = run
 
 
-class ProxyClient:
-    def __init__(self, pf_client: PFClient):
+class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self, pf_client: PFClient
+    ) -> None:
         self._pf_client = pf_client
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def run(self, flow, data, column_mapping=None, **kwargs):
+    def run(
+        self,
+        flow: Union[str, os.PathLike, Callable],
+        data: Union[str, os.PathLike],
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs
+    ) -> ProxyRun:
         flow_to_run = flow
         if hasattr(flow, "_to_async"):
-            flow_to_run = flow._to_async()
+            flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
@@ -39,14 +50,14 @@ class ProxyClient:
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self, proxy_run, all_results=False):
-        run = proxy_run.run.result()
+    def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", np.nan, inplace=True)
         return result_df
 
-    def get_metrics(self, proxy_run):
-        run = proxy_run.run.result()
+    def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
     @staticmethod
@@ -54,8 +65,7 @@ class ProxyClient:
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
-            elif inspect.iscoroutinefunction(flow):
+            if inspect.iscoroutinefunction(flow):
                 return True
-            else:
-                return False
+            return False
         return False

azure/ai/evaluation/_evaluate/_eval_run.py

@@ -8,17 +8,18 @@ import logging
 import os
 import posixpath
 import time
+import types
 import uuid
-from typing import Any, Dict, Optional, Set
+from typing import Any, Dict, Optional, Set, Type
 from urllib.parse import urlparse
 
-from azure.core.pipeline.policies import RetryPolicy
-from azure.core.rest import HttpResponse
-
 from promptflow._sdk.entities import Run
+
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
 from azure.ai.evaluation._version import VERSION
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.core.pipeline.policies import RetryPolicy
+from azure.core.rest import HttpResponse
 
 LOGGER = logging.getLogger(__name__)
 
@@ -165,7 +166,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._url_base = urlparse(self._tracking_uri).netloc
         if self._promptflow_run is not None:
             self.info = RunInfo(
-                self._promptflow_run.name, self._promptflow_run._experiment_name, self._promptflow_run.name
+                self._promptflow_run.name,
+                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run.name,
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
@@ -181,8 +184,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
            if response.status_code != 200:
                 self.info = RunInfo.generate(self._run_name)
                 LOGGER.warning(
-                    f"The run failed to start: {response.status_code}: {response.text()}."
-                    "The results will be saved locally, but will not be logged to Azure."
+                    "The run failed to start: %s: %s."
+                    "The results will be saved locally, but will not be logged to Azure.",
+                    response.status_code,
+                    response.text(),
                 )
                 self._status = RunStatus.BROKEN
             else:
@@ -216,7 +221,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                 internal_message="Incorrect terminal status. Valid statuses are 'FINISHED', 'FAILED' and 'KILLED'",
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN,
             )
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/update"
         body = {
@@ -239,8 +244,21 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         self._start_run()
         return self
 
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        """The context manager exit call."""
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> Optional[bool]:
+        """The context manager exit call.
+
+        :param exc_type: The exception type
+        :type exc_type: Optional[Type[BaseException]]
+        :param exc_value: The exception value
+        :type exc_value: Optional[BaseException]
+        :param exc_tb: The exception traceback
+        :type exc_tb: Optional[types.TracebackType]
+        """
         self._end_run("FINISHED")
 
     def get_run_history_uri(self) -> str:
@@ -280,7 +298,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         # is an optional dependency.
         from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
 
-        return ArmTokenCache().get_token(self._ml_client._credential)
+        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -326,9 +344,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         :type response: HttpResponse
         """
         LOGGER.warning(
-            f"Unable to {failed_op}, "
-            f"the request failed with status code {response.status_code}, "
-            f"{response.text()=}."
+            "Unable to %s, the request failed with status code %s, response.text()=%s.",
+            failed_op,
+            response.status_code,
+            response.text(),
         )
 
     def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool:
@@ -342,7 +361,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         :type bad_states: Set[RunStatus]
         :param should_raise: Should we raise an error if the bad state has been encountered
         :type should_raise: bool
-        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True and invalid state was encountered.
+        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
+            and invalid state was encountered.
         :return: Whether or not run is in the correct state.
         :rtype: bool
         """
@@ -354,7 +374,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                     internal_message=msg,
                     target=ErrorTarget.EVAL_RUN,
                     category=ErrorCategory.FAILED_EXECUTION,
-                    blame=ErrorBlame.UNKNOWN
+                    blame=ErrorBlame.UNKNOWN,
                 )
             LOGGER.warning(msg)
             return False
@@ -446,7 +466,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             return credential.account_key
         if hasattr(credential, "sas_token"):
             return credential.sas_token
-        return self._ml_client.datastores._credential
+        return self._ml_client.datastores._credential  # pylint: disable=protected-access
 
     def log_metric(self, key: str, value: float) -> None:
         """

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -8,27 +8,26 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
-
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from .._model_configurations import AzureAIProject
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
     Prefixes,
     _InternalEvaluationMetrics,
 )
+from .._model_configurations import AzureAIProject
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
-from ._telemetry import log_evaluate_activity
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 # pylint: disable=line-too-long
@@ -260,12 +259,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-                message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-                internal_message="Failed to load data. Confirm that it is valid jsonl data.",
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            ) from e
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
 
     return initial_data_df
 
@@ -436,10 +435,10 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
+    data: str,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
@@ -448,16 +447,16 @@ def evaluate(
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
-    :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None
-    :paramtype data: Optional[str]
-    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-    :paramtype evaluators: Optional[Dict[str, Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
         keys as the column names in the evaluator input and values as the column names in the input data or data
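
With data and evaluators promoted to required keyword arguments, a minimal call now looks like the sketch below (the dataset path and alias are illustrative, and F1ScoreEvaluator is assumed to be exported at the package root; its module appears in the file list above):

    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    result = evaluate(
        data="qa_pairs.jsonl",                  # placeholder .jsonl dataset
        evaluators={"f1": F1ScoreEvaluator()},  # alias -> evaluator callable
        evaluation_name="baseline-f1",          # optional display name
    )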

@@ -572,22 +571,21 @@ def _evaluate(  # pylint: disable=too-many-locals
         user_agent=USER_AGENT,
     )
 
-    trace_destination = pf_client._config.get_trace_destination()
-
+    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    evaluator_config = evaluator_config or {}
+    evaluator_config.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        # Make sure, the default is always in the configuration.
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
         for evaluator_name, mapping in evaluator_config.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
@@ -604,6 +602,16 @@ def _evaluate(  # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
 
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
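
The block added above gives every plain input column an implicit default mapping unless a target output (the "__outputs." prefix) or an explicit entry already covers it. For a dataset with question and ground_truth columns and no overrides, the generated configuration would conceptually look like this sketch (column names invented for illustration):

    evaluator_config = {
        "default": {
            "question": "${data.question}",
            "ground_truth": "${data.ground_truth}",
            # "__outputs.*" columns from the target run and explicitly mapped
            # columns are skipped by the loop above
        }
    }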

@@ -672,7 +680,6 @@ def _evaluate(  # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,