azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142)
  1. azure/ai/evaluation/__init__.py +27 -1
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +39 -5
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +23 -3
  28. azure/ai/evaluation/_constants.py +7 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  36. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
  37. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
  38. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
  39. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  40. azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
  41. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  42. azure/ai/evaluation/_evaluate/_utils.py +3 -3
  43. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  44. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  45. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  46. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  47. azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
  48. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
  49. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
  50. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  51. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  52. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  53. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  54. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  55. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  56. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  57. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  58. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  59. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
  60. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  62. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  63. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  64. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  65. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  66. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  67. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
  68. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  69. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  70. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  72. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  73. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  74. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  75. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  76. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  77. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  78. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  79. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  80. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  81. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  82. azure/ai/evaluation/_exceptions.py +5 -0
  83. azure/ai/evaluation/_legacy/__init__.py +3 -0
  84. azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
  85. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  86. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  87. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  88. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  89. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  90. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  91. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  92. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  94. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  95. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  96. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  97. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  98. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  104. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  105. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  106. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  107. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  109. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  114. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  115. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  116. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  117. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
  118. azure/ai/evaluation/_version.py +1 -1
  119. azure/ai/evaluation/red_team/__init__.py +19 -0
  120. azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
  121. azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
  122. azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
  123. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  124. azure/ai/evaluation/red_team/_red_team.py +1887 -0
  125. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  126. azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
  127. azure/ai/evaluation/red_team/_utils/constants.py +65 -0
  128. azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
  129. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  130. azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
  131. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  132. azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
  133. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  134. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  135. azure/ai/evaluation/simulator/_simulator.py +1 -1
  136. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
  137. azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
  138. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
  139. azure/ai/evaluation/simulator/_tracing.py +0 -89
  140. azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
  141. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
  142. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
@@ -8,15 +8,21 @@ import inspect
 import logging
 import math
 import os
+from datetime import datetime
 from collections import OrderedDict
 from concurrent.futures import Future
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
+from azure.ai.evaluation._legacy._adapters.entities import Run
+from azure.ai.evaluation._legacy._adapters._configuration import Configuration
+from azure.ai.evaluation._legacy._adapters.client import PFClient
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
 import pandas as pd
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
+
+
+Configuration.get_instance().set_config("trace.destination", "none")
 LOGGER = logging.getLogger(__name__)
 
 
@@ -26,46 +32,56 @@ class ProxyRun:
 
 
 class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
-    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
-        self, pf_client: PFClient
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
+        self,
+        **kwargs: Any,
     ) -> None:
-        self._pf_client = pf_client
-        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+        self._pf_client = PFClient(**kwargs)
+        self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")
 
     def run(
         self,
-        flow: Union[str, os.PathLike, Callable],
-        data: Union[str, os.PathLike],
+        flow: Callable,
+        data: Union[str, os.PathLike, pd.DataFrame],
         column_mapping: Optional[Dict[str, str]] = None,
-        **kwargs
+        evaluator_name: Optional[str] = None,
+        **kwargs: Any,
    ) -> ProxyRun:
-        flow_to_run = flow
-        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
+        if isinstance(data, pd.DataFrame):
+            raise ValueError("Data cannot be a pandas DataFrame")
+
+        flow_to_run: Callable = flow
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and isinstance(flow, HasAsyncCallable):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
+        name: str = kwargs.pop("name", "")
+        if not name:
+            name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
             flow_to_run,
             data=data,
-            column_mapping=column_mapping,
+            column_mapping=column_mapping,  # type: ignore
             batch_use_async=batch_use_async,
-            **kwargs
+            name=name,
+            **kwargs,
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
-        run: Run = proxy_run.run.result()
+    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = self.get_result(client_run)
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
-    def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
-        run: Run = proxy_run.run.result()
+    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
         return self._pf_client.get_metrics(run)
 
-    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
-        run = proxy_run.run.result()
+    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
 
         # pylint: disable=protected-access
         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
@@ -81,13 +97,17 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
         return OrderedDict(
             [
                 ("status", status),
-                ("duration", str(run._end_time - run._created_on)),
+                ("duration", str((run._end_time or run._created_on) - run._created_on)),
                 ("completed_lines", completed_lines),
                 ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
        )
 
+    @staticmethod
+    def get_result(run: BatchClientRun) -> Run:
+        return cast(ProxyRun, run).run.result()
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
@@ -5,7 +5,7 @@ import os
 import types
 from typing import Optional, Type
 
-from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
@@ -13,7 +13,7 @@ import uuid
 from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
-from promptflow._sdk.entities import Run
+from azure.ai.evaluation._legacy._adapters.entities import Run
 from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -404,7 +404,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             LOGGER.warning("The run results file was not found, skipping artifacts upload.")
             return
         # First we will list the files and the appropriate remote paths for them.
-        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
+        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_id)
         remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
@@ -6,13 +6,11 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -27,7 +25,14 @@ from .._constants import (
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    ProxyRun,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -35,8 +40,8 @@ from ._utils import (
     _write_output,
     DataLoaderFactory,
 )
+from ._batch_run.batch_clients import BatchClient
 
-TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -71,7 +76,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +127,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -152,26 +157,57 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     ]
     label_cols = []
+    details_cols = []
     for col in df.columns:
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+            details_cols = col
 
     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
+
+    if details_cols:
+        details_df = df[details_cols]
+        detail_defect_rates = {}
+
+        for key, value in details_df.items():
+            _process_rows(value, detail_defect_rates)
+
+        for key, value in detail_defect_rates.items():
+            col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+            try:
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
     return label_cols, defect_rates
 
 
+def _process_rows(row, detail_defect_rates):
+    for key, value in row.items():
+        if key not in detail_defect_rates:
+            detail_defect_rates[key] = []
+        detail_defect_rates[key].append(value)
+    return detail_defect_rates
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
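The new `_details` handling above flattens per-row detail dictionaries into per-key lists and then averages them. A small self-contained sketch of the same aggregation idea, using pandas directly instead of the module's NaN-safe helpers; the column and key names are invented for illustration:

    import pandas as pd

    # Stand-in for one "<evaluator>.<metric>_details" column: each row is a dict of sub-labels.
    details = pd.Series(
        [
            {"sql_injection": 1, "path_injection": 0},
            {"sql_injection": 0, "path_injection": 0},
            {"sql_injection": 1, "path_injection": float("nan")},
        ]
    )

    # Equivalent of _process_rows: collect every value seen for each detail key.
    per_key: dict = {}
    for row in details:
        for key, value in row.items():
            per_key.setdefault(key, []).append(value)

    # Equivalent of the defect-rate step: mean of the numeric values, ignoring NaNs.
    defect_rates = {
        f"{key}_defect_rate": round(float(pd.Series(values).dropna().mean()), 2)
        for key, values in per_key.items()
    }
    print(defect_rates)  # {'sql_injection_defect_rate': 0.67, 'path_injection_defect_rate': 0.0}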
@@ -303,7 +339,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -451,7 +487,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    batch_client: TClient,
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -472,22 +508,31 @@
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
+    if not isinstance(batch_client, ProxyClient):
+        raise ValueError("Only ProxyClient supports target runs for now.")
+
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: ProxyRun = batch_client.run(
-            flow=target,
-            display_name=evaluation_name,
-            data=data,
-            stream=True,
-            name=_run_name,
+        run = cast(
+            ProxyRun,
+            batch_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                stream=True,
+                name=_run_name,
+            ),
        )
 
     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
     run_summary = batch_client.get_run_summary(run)
 
     if run_summary["completed_lines"] == 0:
-        msg = (f"Evaluation target failed to produce any results."
-               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        msg = (
+            f"Evaluation target failed to produce any results."
+            f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+        )
         raise EvaluationException(
             message=msg,
             target=ErrorTarget.EVALUATE,
@@ -577,7 +622,6 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
@@ -728,20 +772,24 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})
 
-    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_run: Optional[Run] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
+        # Right now, only the ProxyClient that uses Promptflow supports a target function
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        batch_run_data = os.path.abspath(data)
+
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
+            target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
        )
 
         for evaluator_name, mapping in column_mapping.items():
@@ -755,6 +803,17 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+    elif kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
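The hunk above replaces the old `_use_pf_client` branch inside `eval_batch_run` with an up-front choice between three batch clients. A condensed restatement of that dispatch as a standalone helper; `_pick_batch_client` is a hypothetical name and the user-agent string is invented, but the branching mirrors the diff:

    import os

    from azure.ai.evaluation._evaluate._batch_run import CodeClient, ProxyClient, RunSubmitterClient


    def _pick_batch_client(data, input_data_df, target, **kwargs):
        """Return (batch client, data payload) the way _evaluate now selects them."""
        if data is not None and target is not None:
            # Only the Promptflow-backed ProxyClient can execute a target function.
            return ProxyClient(user_agent="my-app/0.1"), os.path.abspath(data)
        if kwargs.pop("_use_run_submitter_client", False):
            return RunSubmitterClient(), input_data_df
        if kwargs.pop("_use_pf_client", True):
            # pf.run needs an absolute path when several evaluators share one data file.
            return ProxyClient(user_agent="my-app/0.1"), os.path.abspath(data)
        return CodeClient(), input_data_df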
@@ -770,46 +829,32 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-    def eval_batch_run(
-        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
-    ) -> Dict[str, __EvaluatorInfo]:
-        with EvalRunContext(batch_run_client):
-            runs = {
-                evaluator_name: batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-                for evaluator_name, evaluator in evaluators.items()
-            }
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
 
-            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-            return {
-                evaluator_name: {
-                    "result": batch_run_client.get_details(run, all_results=True),
-                    "metrics": batch_run_client.get_metrics(run),
-                    "run_summary": batch_run_client.get_run_summary(run),
-                }
-                for evaluator_name, run in runs.items()
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
            }
-
-    # Batch Run
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+            for evaluator_name, run in runs.items()
+        }
 
     # Concatenate all results
-    evaluators_result_df = None
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -851,7 +896,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     metrics.update(evaluators_metric)
 
     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
+    target_run: Optional[Run] = None
     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = None
     if trace_destination:
@@ -9,11 +9,10 @@ import logging
 from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
-from promptflow._sdk.entities._flows import FlexFlow as flex_flow
-from promptflow._sdk.entities._flows import Prompty as prompty_sdk
-from promptflow._sdk.entities._flows.dag import Flow as dag_flow
-from promptflow.client import PFClient
-from promptflow.core import Prompty as prompty_core
+from azure.ai.evaluation._legacy._adapters._flows import FlexFlow as flex_flow
+from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty as prompty_sdk
+from azure.ai.evaluation._legacy._adapters._flows import Flow as dag_flow
+from azure.ai.evaluation._legacy._adapters.client import PFClient
 from typing_extensions import ParamSpec
 
 from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
@@ -66,7 +65,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
     try:
         # Cover flex flow and prompty based evaluator
-        if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
+        if isinstance(evaluator, (prompty_sdk, flex_flow)):
            name = evaluator.name
            pf_type = evaluator.__class__.__name__
        # Cover dag flow based evaluator
@@ -94,86 +93,3 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         "type": _get_evaluator_type(evaluator),
         "alias": evaluator_name if evaluator_name else "",
    }
-
-
-# cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
-    """Decorator to log evaluate activity
-
-    :param func: The function to be decorated
-    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
-        from promptflow._sdk._telemetry import ActivityType, log_activity
-        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
-
-        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
-        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-
-        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
-        evaluate_target = bool(kwargs.get("target", None))
-        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions: Dict[str, Union[str, bool]] = {
-            "track_in_cloud": track_in_cloud,
-            "evaluate_target": evaluate_target,
-            "evaluator_config": evaluator_config,
-        }
-
-        with log_activity(
-            get_telemetry_logger(),
-            "pf.evals.evaluate",
-            activity_type=ActivityType.PUBLICAPI,
-            user_agent=USER_AGENT,
-            custom_dimensions=custom_dimensions,
-        ):
-            result = func(*args, **kwargs)
-
-            try:
-                evaluators_info = []
-                for evaluator_name, evaluator in evaluators.items():
-                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
-                    try:
-                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
-                            like=f"outputs.{evaluator_name}", axis=1
-                        )
-
-                        failed_rows = (
-                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
-                        )
-                        total_rows = evaluator_df.shape[0]
-
-                        evaluator_info["failed_rows"] = failed_rows
-                        evaluator_info["total_rows"] = total_rows
-                    except Exception as e:  # pylint: disable=broad-exception-caught
-                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
-                    evaluators_info.append(evaluator_info)
-
-                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
-                with log_activity(
-                    get_telemetry_logger(),
-                    "pf.evals.evaluate_usage_info",
-                    activity_type=ActivityType.PUBLICAPI,
-                    user_agent=USER_AGENT,
-                    custom_dimensions=custom_dimensions,
-                ):
-                    pass
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info: %s", e)
-
-            return result
-
-    return wrapper
@@ -12,7 +12,7 @@ import uuid
 import base64
 
 import pandas as pd
-from promptflow.entities import Run
+from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (
     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
@@ -46,7 +46,7 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
-    from promptflow._cli._utils import get_workspace_triad_from_local
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
 
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
@@ -131,7 +131,7 @@ def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
     trace_destination: Optional[str],
-    run: Run,
+    run: Optional[Run],
     evaluation_name: Optional[str],
     **kwargs,
 ) -> Optional[str]:
@@ -8,6 +8,7 @@ from typing_extensions import overload, override
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
 class BleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,8 @@ class BleuScoreEvaluator(EvaluatorBase):
     indicator of quality.
 
     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    :param threshold: The threshold for the evaluation. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -31,17 +34,27 @@ class BleuScoreEvaluator(EvaluatorBase):
            :language: python
            :dedent: 8
            :caption: Initialize and call an BleuScoreEvaluator.
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_bleu_score_evaluator]
+            :end-before: [END threshold_bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
-        """Produce a glue score evaluation result.
+        """Produce a bleu score evaluation result.
 
         :param eval_input: The input to the evaluation function.
         :type eval_input: Dict
@@ -56,9 +69,16 @@ class BleuScoreEvaluator(EvaluatorBase):
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+        binary_result = False
+        if self._higher_is_better:
+            binary_result = score >= self._threshold
+        else:
+            binary_result = score <= self._threshold
 
         return {
             "bleu_score": score,
+            "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_threshold": self._threshold,
        }
 
    @overload  # type: ignore
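With the new `threshold` argument, BLEU results carry a pass/fail verdict and the threshold alongside the raw score. A hedged usage sketch; the `response`/`ground_truth` call signature follows the evaluator's existing public interface, and the printed values are illustrative:

    from azure.ai.evaluation import BleuScoreEvaluator

    bleu = BleuScoreEvaluator(threshold=0.3)  # higher-is-better, so score >= threshold passes
    result = bleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result["bleu_score"])      # e.g. 0.22
    print(result["bleu_result"])     # "pass" or "fail" via EVALUATION_PASS_FAIL_MAPPING
    print(result["bleu_threshold"])  # 0.3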
@@ -0,0 +1,5 @@
+from ._code_vulnerability import CodeVulnerabilityEvaluator
+
+__all__ = [
+    "CodeVulnerabilityEvaluator",
+]
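The new sub-package only re-exports `CodeVulnerabilityEvaluator`. A hedged wiring sketch: the import path below is the one added in this diff, while the constructor and call parameters are assumed to follow the pattern of the other service-backed evaluators and are not confirmed here:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._evaluators._code_vulnerability import CodeVulnerabilityEvaluator

    # Assumed signature: a credential plus an Azure AI project, like the content-safety evaluators.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }
    code_vuln = CodeVulnerabilityEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )
    result = code_vuln(
        query="Build the SQL for a user lookup",
        response='query = "SELECT * FROM users WHERE id = " + user_id',
    )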