azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries. Note that the comparison runs from 1.0.0 to the earlier 1.0.0b1 pre-release, so code added in the stable release appears below as removed ("-") lines.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,40 +6,38 @@ import functools
  import inspect
  import json
  import logging
- from typing import Callable, Dict, Literal, Optional, Union, cast
+ from typing import Callable, Dict

  import pandas as pd
+
  from promptflow._sdk.entities._flows import FlexFlow as flex_flow
  from promptflow._sdk.entities._flows import Prompty as prompty_sdk
  from promptflow._sdk.entities._flows.dag import Flow as dag_flow
  from promptflow.client import PFClient
  from promptflow.core import Prompty as prompty_core
- from typing_extensions import ParamSpec
-
- from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

  from ..._user_agent import USER_AGENT
  from .._utils import _trace_destination_from_project_scope

  LOGGER = logging.getLogger(__name__)

- P = ParamSpec("P")
-

- def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
+ def _get_evaluator_type(evaluator: Dict[str, Callable]):
      """
      Get evaluator type for telemetry.

      :param evaluator: The evaluator object
      :type evaluator: Dict[str, Callable]
      :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-     :rtype: Literal["content-safety", "built-in", "custom"]
+     :rtype: str
      """
-     module = inspect.getmodule(evaluator)
-     module_name = module.__name__ if module else ""
+     built_in = False
+     content_safety = False

-     built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
-     content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
+     module = inspect.getmodule(evaluator)
+     built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+     if built_in:
+         content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")

      if content_safety:
          return "content-safety"

@@ -84,7 +82,7 @@ def _get_evaluator_properties(evaluator, evaluator_name)
              name = str(evaluator)
              pf_type = "Unknown"
      except Exception as e:  # pylint: disable=broad-exception-caught
-         LOGGER.debug("Failed to get evaluator properties: %s", e)
+         LOGGER.debug(f"Failed to get evaluator properties: {e}")
          name = str(evaluator)
          pf_type = "Unknown"

@@ -97,22 +95,20 @@ def _get_evaluator_properties(evaluator, evaluator_name)


  # cspell:ignore isna
- def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+ def log_evaluate_activity(func) -> None:
      """Decorator to log evaluate activity

      :param func: The function to be decorated
      :type func: Callable
-     :returns: The decorated function
-     :rtype: Callable[P, EvaluationResult]
      """

      @functools.wraps(func)
-     def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
+     def wrapper(*args, **kwargs) -> Callable:
          from promptflow._sdk._telemetry import ActivityType, log_activity
          from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

-         evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
-         azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
+         evaluators = kwargs.get("evaluators", [])
+         azure_ai_project = kwargs.get("azure_ai_project", None)

          pf_client = PFClient(
              config=(

@@ -123,11 +119,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
              user_agent=USER_AGENT,
          )

-         trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-         track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+         track_in_cloud = bool(pf_client._config.get_trace_destination())
          evaluate_target = bool(kwargs.get("target", None))
          evaluator_config = bool(kwargs.get("evaluator_config", None))
-         custom_dimensions: Dict[str, Union[str, bool]] = {
+         custom_dimensions = {
              "track_in_cloud": track_in_cloud,
              "evaluate_target": evaluate_target,
              "evaluator_config": evaluator_config,

@@ -159,7 +154,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
                          evaluator_info["failed_rows"] = failed_rows
                          evaluator_info["total_rows"] = total_rows
                      except Exception as e:  # pylint: disable=broad-exception-caught
-                         LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
+                         LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
                      evaluators_info.append(evaluator_info)

                  custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}

@@ -172,7 +167,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
                  ):
                      pass
              except Exception as e:  # pylint: disable=broad-exception-caught
-                 LOGGER.debug("Failed to collect evaluate usage info: %s", e)
+                 LOGGER.debug(f"Failed to collect evaluate usage info: {e}")

              return result

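For context on the typing change above: 1.0.0 adds typing_extensions.ParamSpec so that log_evaluate_activity preserves the wrapped function's signature for type checkers, whereas the 1.0.0b1 wrapper's bare *args/**kwargs erase it. A minimal, self-contained sketch of that pattern (the names below are illustrative stand-ins, not the SDK's internals):

```python
import functools
from typing import Callable, Dict

from typing_extensions import ParamSpec

P = ParamSpec("P")

# Stand-in for the SDK's EvaluationResult type; illustrative only.
EvaluationResult = Dict[str, float]


def log_activity_decorator(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
    """Wrap a call while keeping its parameter types visible to type checkers."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
        # Telemetry would be emitted here; the decorated call itself is unchanged.
        return func(*args, **kwargs)

    return wrapper
```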
azure/ai/evaluation/_evaluate/_utils.py

@@ -6,23 +6,14 @@ import logging
  import os
  import re
  import tempfile
+ from collections import namedtuple
  from pathlib import Path
- from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
- import uuid
- import base64

  import pandas as pd
- from promptflow.client import PFClient
- from promptflow.entities import Run
-
- from azure.ai.evaluation._constants import (
-     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-     DefaultOpenEncoding,
-     EvaluationRunProperties,
-     Prefixes,
- )
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._model_configurations import AzureAIProject
+
+ from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluate._eval_run import EvalRun

  LOGGER = logging.getLogger(__name__)

@@ -31,45 +22,39 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
      "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
  )

-
- class AzureMLWorkspace(NamedTuple):
-     subscription_id: str
-     resource_group_name: str
-     workspace_name: str
+ AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])


- def is_none(value) -> bool:
+ def is_none(value):
      return value is None or str(value).lower() == "none"


- def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-     trace_provider: str,
- ) -> AzureMLWorkspace:
+ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
      match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
      if not match or len(match.groups()) != 5:
          raise EvaluationException(
-             message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
-             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
-             f"workspaces/<workspace_name>, got {trace_provider}",
-             internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
-             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
-             "workspaces/<workspace_name>,",
-             target=ErrorTarget.UNKNOWN,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.UNKNOWN,
-         )
+             message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+                     "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+                     f"workspaces/<workspace_name>, got {trace_provider}",
+             internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+                              "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+                              "workspaces/<workspace_name>,",
+             target=ErrorTarget.UNKNOWN,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.UNKNOWN,
+         )
      subscription_id = match.group(1)
      resource_group_name = match.group(3)
      workspace_name = match.group(5)
-     return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+     return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)


  def load_jsonl(path):
-     with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
+     with open(path, "r", encoding="utf-8") as f:
          return [json.loads(line) for line in f.readlines()]


- def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+ def _azure_pf_client_and_triad(trace_destination):
      from promptflow.azure._cli._utils import _get_azure_pf_client

      ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)

@@ -82,45 +67,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
      return azure_pf_client, ws_triad


- def _store_multimodal_content(messages, tmpdir: str):
-     # verify if images folder exists
-     images_folder_path = os.path.join(tmpdir, "images")
-     os.makedirs(images_folder_path, exist_ok=True)
-
-     # traverse all messages and replace base64 image data with new file name.
-     for message in messages:
-         if isinstance(message.get("content", []), list):
-             for content in message.get("content", []):
-                 if content.get("type") == "image_url":
-                     image_url = content.get("image_url")
-                     if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                         # Extract the base64 string
-                         base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                         # Generate a unique filename
-                         image_file_name = f"{str(uuid.uuid4())}.jpg"
-                         image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                         # Decode the base64 string to binary image data
-                         image_data_binary = base64.b64decode(base64image)
-
-                         # Write the binary image data to the file
-                         image_file_path = os.path.join(images_folder_path, image_file_name)
-                         with open(image_file_path, "wb") as f:
-                             f.write(image_data_binary)
-
-
  def _log_metrics_and_instance_results(
-     metrics: Dict[str, Any],
-     instance_results: pd.DataFrame,
-     trace_destination: Optional[str],
-     run: Run,
-     evaluation_name: Optional[str],
- ) -> Optional[str]:
-     from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+     metrics,
+     instance_results,
+     trace_destination,
+     run,
+     evaluation_name,
+ ) -> str:
      if trace_destination is None:
-         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
+         LOGGER.error("Unable to log traces as trace destination was not defined.")
          return None

      azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)

@@ -138,21 +93,13 @@ def _log_metrics_and_instance_results(
          ml_client=azure_pf_client.ml_client,
          promptflow_run=run,
      ) as ev_run:
-         artifact_name = EvalRun.EVALUATION_ARTIFACT
+
+         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN

          with tempfile.TemporaryDirectory() as tmpdir:
-             # storing multi_modal images if exists
-             col_name = "inputs.conversation"
-             if col_name in instance_results.columns:
-                 for item in instance_results[col_name].items():
-                     value = item[1]
-                     if "messages" in value:
-                         _store_multimodal_content(value["messages"], tmpdir)
-
-             # storing artifact result
              tmp_path = os.path.join(tmpdir, artifact_name)

-             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+             with open(tmp_path, "w", encoding="utf-8") as f:
                  f.write(instance_results.to_json(orient="records", lines=True))

          ev_run.log_artifact(tmpdir, artifact_name)

@@ -164,9 +111,9 @@
          if run is None:
              ev_run.write_properties_to_run_history(
                  properties={
-                     EvaluationRunProperties.RUN_TYPE: "eval_run",
-                     EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                     "_azureml.evaluation_run": "azure-ai-generative-parent",
                      "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                     "isEvaluatorRun": "true",
                  }
              )

@@ -190,7 +137,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
      return studio_url


- def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+ def _trace_destination_from_project_scope(project_scope: dict) -> str:
      subscription_id = project_scope["subscription_id"]
      resource_group_name = project_scope["resource_group_name"]
      workspace_name = project_scope["project_name"]

@@ -203,20 +150,16 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
      return trace_destination


- def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
+ def _write_output(path, data_dict):
      p = Path(path)
-     if p.is_dir():
+     if os.path.isdir(path):
          p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

-     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+     with open(p, "w") as f:
          json.dump(data_dict, f)

-     print(f'Evaluation results saved to "{p.resolve()}".\n')
-

- def _apply_column_mapping(
-     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
- ) -> pd.DataFrame:
+ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
      """
      Apply column mapping to source_df based on mapping_config.

@@ -224,11 +167,10 @@ def _apply_column_mapping(
      :param source_df: the data frame to be changed.
      :type source_df: pd.DataFrame
      :param mapping_config: The configuration, containing column mapping.
-     :type mapping_config: Dict[str, str].
+     :type mapping_config: dict.
      :param inplace: If true, the source_df will be changed inplace.
      :type inplace: bool
      :return: The modified data frame.
-     :rtype: pd.DataFrame
      """
      result_df = source_df

@@ -265,34 +207,31 @@
      return result_df


- def _has_aggregator(evaluator: object) -> bool:
+ def _has_aggregator(evaluator):
      return hasattr(evaluator, "__aggregate__")


- def get_int_env_var(env_var_name: str, default_value: int) -> int:
+ def get_int_env_var(env_var_name, default_value=None):
      """
-     The function `get_int_env_var` retrieves an integer environment variable value, with a
+     The function `get_int_env_var` retrieves an integer environment variable value, with an optional
      default value if the variable is not set or cannot be converted to an integer.

      :param env_var_name: The name of the environment variable you want to retrieve the value of
-     :type env_var_name: str
      :param default_value: The default value is the value that will be returned if the environment
-     variable is not found or if it cannot be converted to an integer
-     :type default_value: int
+         variable is not found or if it cannot be converted to an integer
      :return: an integer value.
-     :rtype: int
      """
      try:
-         return int(os.environ[env_var_name])
-     except (ValueError, KeyError):
+         return int(os.environ.get(env_var_name, default_value))
+     except Exception:
          return default_value


- def set_event_loop_policy() -> None:
+ def set_event_loop_policy():
      import asyncio
      import platform

      if platform.system().lower() == "windows":
          # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
          # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
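Both versions parse an azureml trace destination of the form azureml://subscriptions/<subscription_id>/resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace_name> into a (subscription, resource group, workspace) triad; 1.0.0 mainly swaps the namedtuple for a typed NamedTuple and tightens the typing. A standalone sketch of the same idea, using an illustrative regex rather than the SDK's exact AZURE_WORKSPACE_REGEX_FORMAT:

```python
import re
from typing import NamedTuple


class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


# Illustrative pattern only; the SDK's AZURE_WORKSPACE_REGEX_FORMAT differs in detail.
_TRACE_DESTINATION_PATTERN = (
    r"^azureml://subscriptions/([^/]+)/resourceGroups/([^/]+)"
    r"(?:/providers/Microsoft\.MachineLearningServices)?/workspaces/([^/]+)$"
)


def parse_trace_destination(trace_destination: str) -> AzureMLWorkspace:
    # Reject anything that does not look like an AzureML workspace URL.
    match = re.match(_TRACE_DESTINATION_PATTERN, trace_destination)
    if not match:
        raise ValueError(f"Malformed trace provider string: {trace_destination}")
    return AzureMLWorkspace(*match.groups())


triad = parse_trace_destination(
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-project"
)
print(triad.workspace_name)  # my-project
```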
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -2,8 +2,8 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
- from promptflow._utils.async_utils import async_run_allowing_running_loop

+ from promptflow._utils.async_utils import async_run_allowing_running_loop
  from azure.ai.evaluation._common.utils import nltk_tokenize


@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:

  class BleuScoreEvaluator:
      """
-     Calculate the BLEU score for a given response and ground truth.
+     Evaluator that computes the BLEU Score between two strings.

      BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-     translation. It is widely used in text summarization and text generation use cases.
+     translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+     generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+     better quality.

-     Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
-     especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-     indicator of quality.
+     **Usage**

-     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+     .. code-block:: python

-     .. admonition:: Example:
+         eval_fn = BleuScoreEvaluator()
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START bleu_score_evaluator]
-             :end-before: [END bleu_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call an BleuScoreEvaluator.
-     """
+     **Output format**

-     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     .. code-block:: python
+
+         {
+             "bleu_score": 0.22
+         }
+     """

      def __init__(self):
          self._async_evaluator = _AsyncBleuScoreEvaluator()

@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The BLEU score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
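The call contract shown in these hunks is the same in both versions: keyword arguments response and ground_truth, with a dict result keyed by bleu_score. A usage sketch based on the docstrings above (the top-level import path is assumed; the exact score depends on tokenization and smoothing):

```python
# Assumes azure-ai-evaluation is installed and exports BleuScoreEvaluator at the top level.
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)  # e.g. {"bleu_score": 0.22}
```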
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

@@ -2,8 +2,8 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- from ._retrieval import RetrievalEvaluator
+ from ._chat import ChatEvaluator

  __all__ = [
-     "RetrievalEvaluator",
+     "ChatEvaluator",
  ]