azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation was flagged as a potentially problematic release.

Files changed (100)
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -0,0 +1,174 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import functools
+ import inspect
+ import json
+ import logging
+ from typing import Callable, Dict
+
+ import pandas as pd
+
+ from promptflow._sdk.entities._flows import FlexFlow as flex_flow
+ from promptflow._sdk.entities._flows import Prompty as prompty_sdk
+ from promptflow._sdk.entities._flows.dag import Flow as dag_flow
+ from promptflow.client import PFClient
+ from promptflow.core import Prompty as prompty_core
+
+ from ..._user_agent import USER_AGENT
+ from .._utils import _trace_destination_from_project_scope
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ def _get_evaluator_type(evaluator: Dict[str, Callable]):
+     """
+     Get evaluator type for telemetry.
+
+     :param evaluator: The evaluator object
+     :type evaluator: Dict[str, Callable]
+     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
+     :rtype: str
+     """
+     built_in = False
+     content_safety = False
+
+     module = inspect.getmodule(evaluator)
+     built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+     if built_in:
+         content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
+
+     if content_safety:
+         return "content-safety"
+     if built_in:
+         return "built-in"
+     return "custom"
+
+
+ def _get_evaluator_properties(evaluator, evaluator_name):
+     """
+     Get evaluator properties for telemetry.
+
+     :param evaluator: The evaluator object
+     :param evaluator_name: The alias for the evaluator
+     :type evaluator_name: str
+     :raises Exception: If the evaluator properties cannot be retrieved
+     :return: A dictionary containing the evaluator properties, including
+         "name": A name for the evaluator
+         "pf_type": The promptflow type being used
+         "type": The evaluator type. Accepted values are "built-in", "custom", and "content-safety"
+         "alias": The alias for the evaluator. Defaults to an empty string.
+     :rtype: Dict[str, str]
+     """
+
+     try:
+         # Cover flex flow and prompty based evaluator
+         if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
+             name = evaluator.name
+             pf_type = evaluator.__class__.__name__
+         # Cover dag flow based evaluator
+         elif isinstance(evaluator, dag_flow):
+             name = evaluator.name
+             pf_type = "DagFlow"
+         elif inspect.isfunction(evaluator):
+             name = evaluator.__name__
+             pf_type = flex_flow.__name__
+         elif hasattr(evaluator, "__class__") and callable(evaluator):
+             name = evaluator.__class__.__name__
+             pf_type = flex_flow.__name__
+         else:
+             # fallback option
+             name = str(evaluator)
+             pf_type = "Unknown"
+     except Exception as e:  # pylint: disable=broad-exception-caught
+         LOGGER.debug(f"Failed to get evaluator properties: {e}")
+         name = str(evaluator)
+         pf_type = "Unknown"
+
+     return {
+         "name": name,
+         "pf_type": pf_type,
+         "type": _get_evaluator_type(evaluator),
+         "alias": evaluator_name if evaluator_name else "",
+     }
+
+
+ # cspell:ignore isna
+ def log_evaluate_activity(func) -> None:
+     """Decorator to log evaluate activity
+
+     :param func: The function to be decorated
+     :type func: Callable
+     """
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs) -> Callable:
+         from promptflow._sdk._telemetry import ActivityType, log_activity
+         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
+
+         evaluators = kwargs.get("evaluators", [])
+         azure_ai_project = kwargs.get("azure_ai_project", None)
+
+         pf_client = PFClient(
+             config=(
+                 {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                 if azure_ai_project
+                 else None
+             ),
+             user_agent=USER_AGENT,
+         )
+
+         track_in_cloud = bool(pf_client._config.get_trace_destination())
+         evaluate_target = bool(kwargs.get("target", None))
+         evaluator_config = bool(kwargs.get("evaluator_config", None))
+         custom_dimensions = {
+             "track_in_cloud": track_in_cloud,
+             "evaluate_target": evaluate_target,
+             "evaluator_config": evaluator_config,
+         }
+
+         with log_activity(
+             get_telemetry_logger(),
+             "pf.evals.evaluate",
+             activity_type=ActivityType.PUBLICAPI,
+             user_agent=USER_AGENT,
+             custom_dimensions=custom_dimensions,
+         ):
+             result = func(*args, **kwargs)
+
+             try:
+                 evaluators_info = []
+                 for evaluator_name, evaluator in evaluators.items():
+                     evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
+                     try:
+                         evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
+                             like=f"outputs.{evaluator_name}", axis=1
+                         )
+
+                         failed_rows = (
+                             evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
+                         )
+                         total_rows = evaluator_df.shape[0]
+
+                         evaluator_info["failed_rows"] = failed_rows
+                         evaluator_info["total_rows"] = total_rows
+                     except Exception as e:  # pylint: disable=broad-exception-caught
+                         LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
+                     evaluators_info.append(evaluator_info)
+
+                 custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
+                 with log_activity(
+                     get_telemetry_logger(),
+                     "pf.evals.evaluate_usage_info",
+                     activity_type=ActivityType.PUBLICAPI,
+                     user_agent=USER_AGENT,
+                     custom_dimensions=custom_dimensions,
+                 ):
+                     pass
+             except Exception as e:  # pylint: disable=broad-exception-caught
+                 LOGGER.debug(f"Failed to collect evaluate usage info: {e}")
+
+             return result
+
+     return wrapper
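For context on how this decorator is consumed: it expects the wrapped function to be called with an `evaluators` mapping and, optionally, an `azure_ai_project` keyword argument, and to return a dict containing a `"rows"` list. The sketch below is a hypothetical illustration only; `my_evaluate`, its body, and the import path are assumptions based on the file shown above, not code from the package.

from azure.ai.evaluation._evaluate._telemetry import log_evaluate_activity  # assumed path, per the file above

@log_evaluate_activity
def my_evaluate(*, evaluators, azure_ai_project=None, **kwargs):
    # Hypothetical evaluate-style function: run each evaluator and return per-row outputs.
    # The decorator reads "rows" from this result to count failed/total rows per evaluator.
    return {"rows": [{"outputs.relevance.score": 4}]}

result = my_evaluate(
    evaluators={"relevance": lambda *, query, response: {"score": 4}},
    azure_ai_project=None,  # no trace destination, so telemetry reports track_in_cloud=False
)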
azure/ai/evaluation/_evaluate/_utils.py
@@ -0,0 +1,237 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import json
+ import logging
+ import os
+ import re
+ import tempfile
+ from collections import namedtuple
+ from pathlib import Path
+
+ import pandas as pd
+
+ from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
+ LOGGER = logging.getLogger(__name__)
+
+ AZURE_WORKSPACE_REGEX_FORMAT = (
+     "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
+     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
+ )
+
+ AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
+
+
+ def is_none(value):
+     return value is None or str(value).lower() == "none"
+
+
+ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
+     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
+     if not match or len(match.groups()) != 5:
+         raise EvaluationException(
+             message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+             f"workspaces/<workspace_name>, got {trace_provider}",
+             internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+             "workspaces/<workspace_name>,",
+             target=ErrorTarget.UNKNOWN,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.UNKNOWN,
+         )
+     subscription_id = match.group(1)
+     resource_group_name = match.group(3)
+     workspace_name = match.group(5)
+     return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
+
+
+ def load_jsonl(path):
+     with open(path, "r", encoding="utf-8") as f:
+         return [json.loads(line) for line in f.readlines()]
+
+
+ def _azure_pf_client_and_triad(trace_destination):
+     from promptflow.azure._cli._utils import _get_azure_pf_client
+
+     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+     azure_pf_client = _get_azure_pf_client(
+         subscription_id=ws_triad.subscription_id,
+         resource_group=ws_triad.resource_group_name,
+         workspace_name=ws_triad.workspace_name,
+     )
+
+     return azure_pf_client, ws_triad
+
+
+ def _log_metrics_and_instance_results(
+     metrics,
+     instance_results,
+     trace_destination,
+     run,
+     evaluation_name,
+ ) -> str:
+     if trace_destination is None:
+         LOGGER.error("Unable to log traces as trace destination was not defined.")
+         return None
+
+     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
+     tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+
+     # Adding line_number as an index column; this is needed by the UI to form links to individual instance runs.
+     instance_results["line_number"] = instance_results.index.values
+
+     with EvalRun(
+         run_name=run.name if run is not None else evaluation_name,
+         tracking_uri=tracking_uri,
+         subscription_id=ws_triad.subscription_id,
+         group_name=ws_triad.resource_group_name,
+         workspace_name=ws_triad.workspace_name,
+         ml_client=azure_pf_client.ml_client,
+         promptflow_run=run,
+     ) as ev_run:
+
+         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+
+         with tempfile.TemporaryDirectory() as tmpdir:
+             tmp_path = os.path.join(tmpdir, artifact_name)
+
+             with open(tmp_path, "w", encoding="utf-8") as f:
+                 f.write(instance_results.to_json(orient="records", lines=True))
+
+             ev_run.log_artifact(tmpdir, artifact_name)
+
+         # Using mlflow to create a dummy run: a run created via PF shows traces of the dummy run in the UI,
+         # and those traces can be confusing.
+         # Adding these properties avoids showing traces when a dummy run is created.
+         # We do that only for the pure evaluation runs.
+         if run is None:
+             ev_run.write_properties_to_run_history(
+                 properties={
+                     "_azureml.evaluation_run": "azure-ai-generative-parent",
+                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                     "isEvaluatorRun": "true",
+                 }
+             )
+
+         for metric_name, metric_value in metrics.items():
+             ev_run.log_metric(metric_name, metric_value)
+
+     evaluation_id = ev_run.info.run_name if run is not None else ev_run.info.run_id
+     return _get_ai_studio_url(trace_destination=trace_destination, evaluation_id=evaluation_id)
+
+
+ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
+     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+     studio_base_url = os.getenv("AI_STUDIO_BASE_URL", "https://ai.azure.com")
+
+     studio_url = (
+         f"{studio_base_url}/build/evaluation/{evaluation_id}?wsid=/subscriptions/{ws_triad.subscription_id}"
+         f"/resourceGroups/{ws_triad.resource_group_name}/providers/Microsoft.MachineLearningServices/"
+         f"workspaces/{ws_triad.workspace_name}"
+     )
+
+     return studio_url
+
+
+ def _trace_destination_from_project_scope(project_scope: dict) -> str:
+     subscription_id = project_scope["subscription_id"]
+     resource_group_name = project_scope["resource_group_name"]
+     workspace_name = project_scope["project_name"]
+
+     trace_destination = (
+         f"azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/"
+         f"providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"
+     )
+
+     return trace_destination
+
+
+ def _write_output(path, data_dict):
+     p = Path(path)
+     if os.path.isdir(path):
+         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
+
+     with open(p, "w") as f:
+         json.dump(data_dict, f)
+
+
+ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
+     """
+     Apply column mapping to source_df based on mapping_config.
+
+     This function is used for pre-validation of input data for evaluators.
+
+     :param source_df: The data frame to be changed.
+     :type source_df: pd.DataFrame
+     :param mapping_config: The configuration containing the column mapping.
+     :type mapping_config: dict
+     :param inplace: If True, source_df is changed in place.
+     :type inplace: bool
+     :return: The modified data frame.
+     """
+     result_df = source_df
+
+     if mapping_config:
+         column_mapping = {}
+         columns_to_drop = set()
+         pattern_prefix = "data."
+         run_outputs_prefix = "run.outputs."
+
+         for map_to_key, map_value in mapping_config.items():
+             match = re.search(r"^\${([^{}]+)}$", map_value)
+             if match is not None:
+                 pattern = match.group(1)
+                 if pattern.startswith(pattern_prefix):
+                     map_from_key = pattern[len(pattern_prefix) :]
+                 elif pattern.startswith(run_outputs_prefix):
+                     # Target-generated columns always start with ".outputs."
+                     map_from_key = f"{Prefixes.TSG_OUTPUTS}{pattern[len(run_outputs_prefix) :]}"
+                 # If we are not renaming anything, skip.
+                 if map_from_key == map_to_key:
+                     continue
+                 # If a column needs to be mapped to an already existing column, add it
+                 # to the drop list.
+                 if map_to_key in source_df.columns:
+                     columns_to_drop.add(map_to_key)
+                 column_mapping[map_from_key] = map_to_key
+         # If we map a column to another one that is already present in the data
+         # set and the latter also needs to be mapped, we do not drop it, but map it
+         # instead.
+         columns_to_drop = columns_to_drop - set(column_mapping.keys())
+         result_df = source_df.drop(columns=columns_to_drop, inplace=inplace)
+         result_df.rename(columns=column_mapping, inplace=True)
+
+     return result_df
+
+
+ def _has_aggregator(evaluator):
+     return hasattr(evaluator, "__aggregate__")
+
+
+ def get_int_env_var(env_var_name, default_value=None):
+     """
+     The function `get_int_env_var` retrieves an integer environment variable value, with an optional
+     default value if the variable is not set or cannot be converted to an integer.
+
+     :param env_var_name: The name of the environment variable to retrieve the value of
+     :param default_value: The value returned if the environment variable is not found
+         or cannot be converted to an integer
+     :return: An integer value.
+     """
+     try:
+         return int(os.environ.get(env_var_name, default_value))
+     except Exception:
+         return default_value
+
+
+ def set_event_loop_policy():
+     import asyncio
+     import platform
+
+     if platform.system().lower() == "windows":
+         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
+         # On Windows there seems to be a problem with the default EventLoopPolicy; use this snippet to work around it.
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
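To make the `${data.*}` / `${run.outputs.*}` mapping syntax handled by `_apply_column_mapping` concrete, here is a minimal sketch. It assumes the file path inferred above; the column names and mapping are invented for illustration, and only the `${data.*}` branch is exercised.

import pandas as pd
from azure.ai.evaluation._evaluate._utils import _apply_column_mapping  # assumed path, per the file above

# Keys are the column names an evaluator expects; values reference source columns.
df = pd.DataFrame({"question": ["What is 2 + 2?"], "answer": ["4"]})
mapped = _apply_column_mapping(df, {"query": "${data.question}", "response": "${data.answer}"})
print(list(mapped.columns))  # expected: ['query', 'response']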
azure/ai/evaluation/_evaluators/__init__.py
@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
azure/ai/evaluation/_evaluators/_bleu/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._bleu import BleuScoreEvaluator
+
+ __all__ = [
+     "BleuScoreEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -0,0 +1,73 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.utils import nltk_tokenize
+
+
+ class _AsyncBleuScoreEvaluator:
+     def __init__(self):
+         pass
+
+     async def __call__(self, *, response: str, ground_truth: str, **kwargs):
+         reference_tokens = nltk_tokenize(ground_truth)
+         hypothesis_tokens = nltk_tokenize(response)
+
+         # NIST smoothing
+         smoothing_function = SmoothingFunction().method4
+         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+         return {
+             "bleu_score": score,
+         }
+
+
+ class BleuScoreEvaluator:
+     """
+     Evaluator that computes the BLEU score between two strings.
+
+     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
+     translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+     generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+     better quality.
+
+     **Usage**
+
+     .. code-block:: python
+
+         eval_fn = BleuScoreEvaluator()
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "bleu_score": 0.22
+         }
+     """
+
+     def __init__(self):
+         self._async_evaluator = _AsyncBleuScoreEvaluator()
+
+     def __call__(self, *, response: str, ground_truth: str, **kwargs):
+         """
+         Evaluate the BLEU score between the response and the ground truth.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The BLEU score.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
+         )
+
+     def _to_async(self):
+         return self._async_evaluator
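The class docstring above already shows the intended public usage. As a rough sketch of the underlying computation only (an assumption based on the code above, with `nltk.word_tokenize` standing in for the package-internal `nltk_tokenize` helper):

import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

nltk.download("punkt", quiet=True)  # tokenizer data; required once

reference_tokens = nltk.word_tokenize("The capital of Japan is Tokyo.")
hypothesis_tokens = nltk.word_tokenize("Tokyo is the capital of Japan.")
score = sentence_bleu(
    [reference_tokens], hypothesis_tokens, smoothing_function=SmoothingFunction().method4
)
print({"bleu_score": round(score, 2)})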
azure/ai/evaluation/_evaluators/_chat/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._chat import ChatEvaluator
+
+ __all__ = [
+     "ChatEvaluator",
+ ]