azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (105)
  1. azure/ai/evaluation/__init__.py +5 -31
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +120 -300
  4. azure/ai/evaluation/_common/utils.py +23 -381
  5. azure/ai/evaluation/_constants.py +6 -19
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
  9. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
  10. azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
  11. azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +28 -82
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +132 -203
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +1 -2
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
  60. azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
  61. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
  62. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
  63. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  64. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
  65. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
  66. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  67. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  68. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  69. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  70. azure/ai/evaluation/simulator/_simulator.py +207 -277
  71. azure/ai/evaluation/simulator/_tracing.py +4 -4
  72. azure/ai/evaluation/simulator/_utils.py +13 -31
  73. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
  74. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
  75. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  78. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  79. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  80. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  81. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  82. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  84. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  85. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  86. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  87. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  88. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  89. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  91. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  92. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  93. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  94. azure/ai/evaluation/_vendor/__init__.py +0 -3
  95. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  96. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  97. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  98. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  100. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  101. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  102. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  103. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  104. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  105. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict, Literal, Optional, Union, cast
+from typing import Callable, Dict, TypeVar

 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,30 +16,31 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
-
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

 P = ParamSpec("P")
+R = TypeVar("R")


-def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """
     Get evaluator type for telemetry.

     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype: Literal["content-safety", "built-in", "custom"]
+    :rtype: str
     """
-    module = inspect.getmodule(evaluator)
-    module_name = module.__name__ if module else ""
+    built_in = False
+    content_safety = False

-    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
-    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")

     if content_safety:
         return "content-safety"
@@ -97,22 +98,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
+    :rtype: Callable[P, R]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

-        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
-        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)

         pf_client = PFClient(
             config=(
@@ -123,11 +124,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, Ev
             user_agent=USER_AGENT,
         )

-        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions: Dict[str, Union[str, bool]] = {
+        custom_dimensions = {
             "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,
azure/ai/evaluation/_evaluate/_utils.py

@@ -6,23 +6,15 @@ import logging
 import os
 import re
 import tempfile
+from collections import namedtuple
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
-import uuid
-import base64
+from typing import Dict

 import pandas as pd
-from promptflow.client import PFClient
-from promptflow.entities import Run
-
-from azure.ai.evaluation._constants import (
-    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-    DefaultOpenEncoding,
-    EvaluationRunProperties,
-    Prefixes,
-)
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject

 LOGGER = logging.getLogger(__name__)

@@ -31,20 +23,14 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )

-
-class AzureMLWorkspace(NamedTuple):
-    subscription_id: str
-    resource_group_name: str
-    workspace_name: str
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])


-def is_none(value) -> bool:
+def is_none(value):
     return value is None or str(value).lower() == "none"


-def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-    trace_provider: str,
-) -> AzureMLWorkspace:
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -61,7 +47,7 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-lon
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)


 def load_jsonl(path):
@@ -69,7 +55,7 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]


-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+def _azure_pf_client_and_triad(trace_destination):
     from promptflow.azure._cli._utils import _get_azure_pf_client

     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -82,45 +68,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
     return azure_pf_client, ws_triad


-def _store_multimodal_content(messages, tmpdir: str):
-    # verify if images folder exists
-    images_folder_path = os.path.join(tmpdir, "images")
-    os.makedirs(images_folder_path, exist_ok=True)
-
-    # traverse all messages and replace base64 image data with new file name.
-    for message in messages:
-        if isinstance(message.get("content", []), list):
-            for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
-
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
-
-
 def _log_metrics_and_instance_results(
-    metrics: Dict[str, Any],
-    instance_results: pd.DataFrame,
-    trace_destination: Optional[str],
-    run: Run,
-    evaluation_name: Optional[str],
-) -> Optional[str]:
-    from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
     if trace_destination is None:
-        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
         return None

     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -138,18 +94,10 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-        artifact_name = EvalRun.EVALUATION_ARTIFACT
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN

         with tempfile.TemporaryDirectory() as tmpdir:
-            # storing multi_modal images if exists
-            col_name = "inputs.conversation"
-            if col_name in instance_results.columns:
-                for item in instance_results[col_name].items():
-                    value = item[1]
-                    if "messages" in value:
-                        _store_multimodal_content(value["messages"], tmpdir)
-
-            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)

             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -164,9 +112,9 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-                    EvaluationRunProperties.RUN_TYPE: "eval_run",
-                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    "_azureml.evaluation_run": "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                    "isEvaluatorRun": "true",
                 }
             )

@@ -190,7 +138,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url


-def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -203,19 +151,17 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     return trace_destination


-def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
+def _write_output(path, data_dict):
     p = Path(path)
-    if p.is_dir():
+    if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)

-    print(f'Evaluation results saved to "{p.resolve()}".\n')
-

 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
@@ -265,7 +211,7 @@ def _apply_column_mapping(
     return result_df


-def _has_aggregator(evaluator: object) -> bool:
+def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")


@@ -288,11 +234,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
     return default_value


-def set_event_loop_policy() -> None:
+def set_event_loop_policy():
     import asyncio
     import platform

     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
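Both versions model the workspace triad returned by extract_workspace_triad_from_trace_provider; 1.0.0 uses a typed NamedTuple class, 1.0.0b2 a collections.namedtuple. A minimal sketch with invented placeholder values, showing the two shapes are interchangeable for positional and attribute access:

    # Minimal sketch with invented placeholder values: the 1.0.0 NamedTuple and the
    # 1.0.0b2 collections.namedtuple describe the same workspace triad.
    from collections import namedtuple
    from typing import NamedTuple

    # 1.0.0b2 shape
    AzureMLWorkspaceTriad = namedtuple(
        "AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]
    )


    # 1.0.0 shape
    class AzureMLWorkspace(NamedTuple):
        subscription_id: str
        resource_group_name: str
        workspace_name: str


    old_style = AzureMLWorkspace("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")
    new_style = AzureMLWorkspaceTriad("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")

    assert old_style.workspace_name == new_style.workspace_name
    assert tuple(old_style) == tuple(new_style)  # both unpack the same way

The practical difference is that the 1.0.0 class carries field type annotations for static checking; at runtime the callers shown in the hunks above treat both identically.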
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:

 class BleuScoreEvaluator:
     """
-    Calculate the BLEU score for a given response and ground truth.
+    Evaluator that computes the BLEU Score between two strings.

     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.

-    Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
-    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-    indicator of quality.
+    **Usage**

-    The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    .. code-block:: python

-    .. admonition:: Example:
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")

-    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-        :start-after: [START bleu_score_evaluator]
-        :end-before: [END bleu_score_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call an BleuScoreEvaluator.
-    """
+    **Output format**
+
+    .. code-block:: python

-    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+        {
+            "bleu_score": 0.22
+        }
+    """

     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
-        :rtype: Dict[str, float]
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
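The 1.0.0b2 docstring above already shows the call shape and output format of BleuScoreEvaluator. For readers who want a comparable number outside the SDK, here is a standalone sketch using NLTK's sentence_bleu; the choice of NLTK, the whitespace tokenization, and the smoothing method are assumptions for illustration and will not necessarily reproduce the package's exact score.

    # Standalone BLEU sketch using NLTK. The library choice, whitespace tokenization,
    # and smoothing method are assumptions for illustration; azure-ai-evaluation's
    # internals may differ, so the score will not match the SDK output exactly.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    response = "Tokyo is the capital of Japan."
    ground_truth = "The capital of Japan is Tokyo."

    hypothesis = response.lower().split()       # naive tokenization, illustration only
    reference = [ground_truth.lower().split()]  # sentence_bleu expects a list of references

    score = sentence_bleu(reference, hypothesis, smoothing_function=SmoothingFunction().method4)
    print({"bleu_score": round(score, 2)})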
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from ._retrieval import RetrievalEvaluator
+from ._chat import ChatEvaluator

 __all__ = [
-    "RetrievalEvaluator",
+    "ChatEvaluator",
 ]
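Because both the subpackage name and the exported symbol change between these wheels, code that reaches into this private module has to branch on what is importable. A hypothetical, version-tolerant import sketch (these are private paths shown only in this diff, not a supported public API):

    # Hypothetical, version-tolerant import sketch. These are private module paths
    # taken from this diff, not a supported public API; prefer the top-level
    # azure.ai.evaluation exports where possible.
    try:
        # 1.0.0b2 layout
        from azure.ai.evaluation._evaluators._chat import ChatEvaluator as _Evaluator
    except ImportError:
        # 1.0.0 layout
        from azure.ai.evaluation._evaluators._retrieval import RetrievalEvaluator as _Evaluator

    print(_Evaluator.__name__)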