azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -0,0 +1,179 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import functools
+ import inspect
+ import json
+ import logging
+ from typing import Callable, Dict, Literal, Optional, Union, cast
+
+ import pandas as pd
+ from promptflow._sdk.entities._flows import FlexFlow as flex_flow
+ from promptflow._sdk.entities._flows import Prompty as prompty_sdk
+ from promptflow._sdk.entities._flows.dag import Flow as dag_flow
+ from promptflow.client import PFClient
+ from promptflow.core import Prompty as prompty_core
+ from typing_extensions import ParamSpec
+
+ from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+
+ from ..._user_agent import USER_AGENT
+ from .._utils import _trace_destination_from_project_scope
+
+ LOGGER = logging.getLogger(__name__)
+
+ P = ParamSpec("P")
+
+
+ def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
+     """
+     Get evaluator type for telemetry.
+
+     :param evaluator: The evaluator object
+     :type evaluator: Dict[str, Callable]
+     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
+     :rtype: Literal["content-safety", "built-in", "custom"]
+     """
+     module = inspect.getmodule(evaluator)
+     module_name = module.__name__ if module else ""
+
+     built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+     content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
+
+     if content_safety:
+         return "content-safety"
+     if built_in:
+         return "built-in"
+     return "custom"
+
+
+ def _get_evaluator_properties(evaluator, evaluator_name):
+     """
+     Get evaluator properties for telemetry.
+
+     :param evaluator: The evaluator object
+     :param evaluator_name: The alias for the evaluator
+     :type evaluator_name: str
+     :raises Exception: If the evaluator properties cannot be retrieved
+     :return: A dictionary containing the evaluator properties, including
+         "name": A name for the evaluator
+         "pf_type": The promptflow type being used
+         "type": The evaluator type. Accepted values are "built-in", "custom", and "content-safety"
+         "alias": The alias for the evaluator. Defaults to an empty string.
+     :rtype: Dict[str, str]
+     """
+
+     try:
+         # Cover flex flow and prompty based evaluator
+         if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
+             name = evaluator.name
+             pf_type = evaluator.__class__.__name__
+         # Cover dag flow based evaluator
+         elif isinstance(evaluator, dag_flow):
+             name = evaluator.name
+             pf_type = "DagFlow"
+         elif inspect.isfunction(evaluator):
+             name = evaluator.__name__
+             pf_type = flex_flow.__name__
+         elif hasattr(evaluator, "__class__") and callable(evaluator):
+             name = evaluator.__class__.__name__
+             pf_type = flex_flow.__name__
+         else:
+             # fallback option
+             name = str(evaluator)
+             pf_type = "Unknown"
+     except Exception as e:  # pylint: disable=broad-exception-caught
+         LOGGER.debug("Failed to get evaluator properties: %s", e)
+         name = str(evaluator)
+         pf_type = "Unknown"
+
+     return {
+         "name": name,
+         "pf_type": pf_type,
+         "type": _get_evaluator_type(evaluator),
+         "alias": evaluator_name if evaluator_name else "",
+     }
+
+
+ # cspell:ignore isna
+ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+     """Decorator to log evaluate activity
+
+     :param func: The function to be decorated
+     :type func: Callable
+     :returns: The decorated function
+     :rtype: Callable[P, EvaluationResult]
+     """
+
+     @functools.wraps(func)
+     def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
+         from promptflow._sdk._telemetry import ActivityType, log_activity
+         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
+
+         evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+         azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
+
+         pf_client = PFClient(
+             config=(
+                 {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                 if azure_ai_project
+                 else None
+             ),
+             user_agent=USER_AGENT,
+         )
+
+         trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+         track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+         evaluate_target = bool(kwargs.get("target", None))
+         evaluator_config = bool(kwargs.get("evaluator_config", None))
+         custom_dimensions: Dict[str, Union[str, bool]] = {
+             "track_in_cloud": track_in_cloud,
+             "evaluate_target": evaluate_target,
+             "evaluator_config": evaluator_config,
+         }
+
+         with log_activity(
+             get_telemetry_logger(),
+             "pf.evals.evaluate",
+             activity_type=ActivityType.PUBLICAPI,
+             user_agent=USER_AGENT,
+             custom_dimensions=custom_dimensions,
+         ):
+             result = func(*args, **kwargs)
+
+             try:
+                 evaluators_info = []
+                 for evaluator_name, evaluator in evaluators.items():
+                     evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
+                     try:
+                         evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
+                             like=f"outputs.{evaluator_name}", axis=1
+                         )
+
+                         failed_rows = (
+                             evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
+                         )
+                         total_rows = evaluator_df.shape[0]
+
+                         evaluator_info["failed_rows"] = failed_rows
+                         evaluator_info["total_rows"] = total_rows
+                     except Exception as e:  # pylint: disable=broad-exception-caught
+                         LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
+                     evaluators_info.append(evaluator_info)
+
+                 custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
+                 with log_activity(
+                     get_telemetry_logger(),
+                     "pf.evals.evaluate_usage_info",
+                     activity_type=ActivityType.PUBLICAPI,
+                     user_agent=USER_AGENT,
+                     custom_dimensions=custom_dimensions,
+                 ):
+                     pass
+             except Exception as e:  # pylint: disable=broad-exception-caught
+                 LOGGER.debug("Failed to collect evaluate usage info: %s", e)
+
+             return result
+
+     return wrapper
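The decorator in this file wraps an evaluate-style function and logs a `pf.evals.evaluate` activity plus a follow-up usage-info activity with per-evaluator row counts. A minimal sketch of how it might be applied, assuming promptflow is installed; `my_evaluate` and the inline lambda evaluator are hypothetical stand-ins, not part of the package:

    from azure.ai.evaluation._evaluate._telemetry import log_evaluate_activity

    @log_evaluate_activity
    def my_evaluate(*, data, evaluators, azure_ai_project=None, target=None, evaluator_config=None, **kwargs):
        # A real implementation would run each evaluator over the data; the wrapper only
        # needs a result dict with a "rows" list to compute failed/total row counts.
        return {"rows": [], "metrics": {}}

    result = my_evaluate(
        data="data.jsonl",
        evaluators={"relevance": lambda *, query, response, **kw: {"relevance": 5.0}},
    )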
azure/ai/evaluation/_evaluate/_utils.py
@@ -0,0 +1,298 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import json
+ import logging
+ import os
+ import re
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+ import uuid
+ import base64
+
+ import pandas as pd
+ from promptflow.client import PFClient
+ from promptflow.entities import Run
+
+ from azure.ai.evaluation._constants import (
+     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+     DefaultOpenEncoding,
+     EvaluationRunProperties,
+     Prefixes,
+ )
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._model_configurations import AzureAIProject
+
+ LOGGER = logging.getLogger(__name__)
+
+ AZURE_WORKSPACE_REGEX_FORMAT = (
+     "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
+     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
+ )
+
+
+ class AzureMLWorkspace(NamedTuple):
+     subscription_id: str
+     resource_group_name: str
+     workspace_name: str
+
+
+ def is_none(value) -> bool:
+     return value is None or str(value).lower() == "none"
+
+
+ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+     trace_provider: str,
+ ) -> AzureMLWorkspace:
+     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
+     if not match or len(match.groups()) != 5:
+         raise EvaluationException(
+             message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+             f"workspaces/<workspace_name>, got {trace_provider}",
+             internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+             "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+             "workspaces/<workspace_name>,",
+             target=ErrorTarget.UNKNOWN,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.UNKNOWN,
+         )
+     subscription_id = match.group(1)
+     resource_group_name = match.group(3)
+     workspace_name = match.group(5)
+     return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
+
+
+ def load_jsonl(path):
+     with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
+         return [json.loads(line) for line in f.readlines()]
+
+
+ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+     from promptflow.azure._cli._utils import _get_azure_pf_client
+
+     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+     azure_pf_client = _get_azure_pf_client(
+         subscription_id=ws_triad.subscription_id,
+         resource_group=ws_triad.resource_group_name,
+         workspace_name=ws_triad.workspace_name,
+     )
+
+     return azure_pf_client, ws_triad
+
+
+ def _store_multimodal_content(messages, tmpdir: str):
+     # verify if images folder exists
+     images_folder_path = os.path.join(tmpdir, "images")
+     os.makedirs(images_folder_path, exist_ok=True)
+
+     # traverse all messages and replace base64 image data with new file name.
+     for message in messages:
+         if isinstance(message.get("content", []), list):
+             for content in message.get("content", []):
+                 if content.get("type") == "image_url":
+                     image_url = content.get("image_url")
+                     if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
+                         # Extract the base64 string
+                         base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+
+                         # Generate a unique filename
+                         image_file_name = f"{str(uuid.uuid4())}.jpg"
+                         image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+                         # Decode the base64 string to binary image data
+                         image_data_binary = base64.b64decode(base64image)
+
+                         # Write the binary image data to the file
+                         image_file_path = os.path.join(images_folder_path, image_file_name)
+                         with open(image_file_path, "wb") as f:
+                             f.write(image_data_binary)
+
+
+ def _log_metrics_and_instance_results(
+     metrics: Dict[str, Any],
+     instance_results: pd.DataFrame,
+     trace_destination: Optional[str],
+     run: Run,
+     evaluation_name: Optional[str],
+ ) -> Optional[str]:
+     from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
+     if trace_destination is None:
+         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
+         return None
+
+     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
+     tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+
+     # Adding line_number as an index column; this is needed by the UI to form links to individual instance runs
+     instance_results["line_number"] = instance_results.index.values
+
+     with EvalRun(
+         run_name=run.name if run is not None else evaluation_name,
+         tracking_uri=tracking_uri,
+         subscription_id=ws_triad.subscription_id,
+         group_name=ws_triad.resource_group_name,
+         workspace_name=ws_triad.workspace_name,
+         ml_client=azure_pf_client.ml_client,
+         promptflow_run=run,
+     ) as ev_run:
+         artifact_name = EvalRun.EVALUATION_ARTIFACT
+
+         with tempfile.TemporaryDirectory() as tmpdir:
+             # storing multi_modal images if exists
+             col_name = "inputs.conversation"
+             if col_name in instance_results.columns:
+                 for item in instance_results[col_name].items():
+                     value = item[1]
+                     if "messages" in value:
+                         _store_multimodal_content(value["messages"], tmpdir)
+
+             # storing artifact result
+             tmp_path = os.path.join(tmpdir, artifact_name)
+
+             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                 f.write(instance_results.to_json(orient="records", lines=True))
+
+             ev_run.log_artifact(tmpdir, artifact_name)
+
+             # Using mlflow to create a dummy run since once created via PF show traces of dummy run in UI.
+             # Those traces can be confusing.
+             # adding these properties to avoid showing traces if a dummy run is created.
+             # We are doing that only for the pure evaluation runs.
+             if run is None:
+                 ev_run.write_properties_to_run_history(
+                     properties={
+                         EvaluationRunProperties.RUN_TYPE: "eval_run",
+                         EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                     }
+                 )
+
+         for metric_name, metric_value in metrics.items():
+             ev_run.log_metric(metric_name, metric_value)
+
+         evaluation_id = ev_run.info.run_name if run is not None else ev_run.info.run_id
+     return _get_ai_studio_url(trace_destination=trace_destination, evaluation_id=evaluation_id)
+
+
+ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
+     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+     studio_base_url = os.getenv("AI_STUDIO_BASE_URL", "https://ai.azure.com")
+
+     studio_url = (
+         f"{studio_base_url}/build/evaluation/{evaluation_id}?wsid=/subscriptions/{ws_triad.subscription_id}"
+         f"/resourceGroups/{ws_triad.resource_group_name}/providers/Microsoft.MachineLearningServices/"
+         f"workspaces/{ws_triad.workspace_name}"
+     )
+
+     return studio_url
+
+
+ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+     subscription_id = project_scope["subscription_id"]
+     resource_group_name = project_scope["resource_group_name"]
+     workspace_name = project_scope["project_name"]
+
+     trace_destination = (
+         f"azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/"
+         f"providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"
+     )
+
+     return trace_destination
+
+
+ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
+     p = Path(path)
+     if p.is_dir():
+         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
+
+     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+         json.dump(data_dict, f)
+
+     print(f'Evaluation results saved to "{p.resolve()}".\n')
+
+
+ def _apply_column_mapping(
+     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
+ ) -> pd.DataFrame:
+     """
+     Apply column mapping to source_df based on mapping_config.
+
+     This function is used for pre-validation of input data for evaluators.
+     :param source_df: the data frame to be changed.
+     :type source_df: pd.DataFrame
+     :param mapping_config: The configuration, containing column mapping.
+     :type mapping_config: Dict[str, str]
+     :param inplace: If true, the source_df will be changed inplace.
+     :type inplace: bool
+     :return: The modified data frame.
+     :rtype: pd.DataFrame
+     """
+     result_df = source_df
+
+     if mapping_config:
+         column_mapping = {}
+         columns_to_drop = set()
+         pattern_prefix = "data."
+         run_outputs_prefix = "run.outputs."
+
+         for map_to_key, map_value in mapping_config.items():
+             match = re.search(r"^\${([^{}]+)}$", map_value)
+             if match is not None:
+                 pattern = match.group(1)
+                 if pattern.startswith(pattern_prefix):
+                     map_from_key = pattern[len(pattern_prefix) :]
+                 elif pattern.startswith(run_outputs_prefix):
+                     # Target-generated columns always start with the ".outputs." prefix
+                     map_from_key = f"{Prefixes.TSG_OUTPUTS}{pattern[len(run_outputs_prefix) :]}"
+                 # if we are not renaming anything, skip.
+                 if map_from_key == map_to_key:
+                     continue
+                 # If a column needs to be mapped to an already existing column, we will add it
+                 # to the drop list.
+                 if map_to_key in source_df.columns:
+                     columns_to_drop.add(map_to_key)
+                 column_mapping[map_from_key] = map_to_key
+         # If we map a column to another one which is already present in the data
+         # set and the latter also needs to be mapped, we will not drop it, but map
+         # it instead.
+         columns_to_drop = columns_to_drop - set(column_mapping.keys())
+         result_df = source_df.drop(columns=columns_to_drop, inplace=inplace)
+         result_df.rename(columns=column_mapping, inplace=True)
+
+     return result_df
+
+
+ def _has_aggregator(evaluator: object) -> bool:
+     return hasattr(evaluator, "__aggregate__")
+
+
+ def get_int_env_var(env_var_name: str, default_value: int) -> int:
+     """
+     The function `get_int_env_var` retrieves an integer environment variable value, with a
+     default value if the variable is not set or cannot be converted to an integer.
+
+     :param env_var_name: The name of the environment variable you want to retrieve the value of
+     :type env_var_name: str
+     :param default_value: The default value is the value that will be returned if the environment
+         variable is not found or if it cannot be converted to an integer
+     :type default_value: int
+     :return: an integer value.
+     :rtype: int
+     """
+     try:
+         return int(os.environ[env_var_name])
+     except (ValueError, KeyError):
+         return default_value
+
+
+ def set_event_loop_policy() -> None:
+     import asyncio
+     import platform
+
+     if platform.system().lower() == "windows":
+         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
+         # On Windows there seems to be a problem with the default EventLoopPolicy; use this snippet to work around it
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
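Two of the helpers above are easy to exercise in isolation. A small illustrative sketch (the subscription, resource group, workspace, and environment-variable names below are placeholders, not values from this package):

    import os
    from azure.ai.evaluation._evaluate._utils import (
        extract_workspace_triad_from_trace_provider,
        get_int_env_var,
    )

    trace_destination = (
        "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
        "/resourceGroups/my-rg/providers/Microsoft.MachineLearningServices/workspaces/my-workspace"
    )
    # Parses the azureml:// trace destination into (subscription_id, resource_group_name, workspace_name)
    ws = extract_workspace_triad_from_trace_provider(trace_destination)
    print(ws.subscription_id, ws.resource_group_name, ws.workspace_name)

    # Falls back to the default when the variable is unset or not an integer
    os.environ["EXAMPLE_TIMEOUT_SECONDS"] = "300"
    print(get_int_env_var("EXAMPLE_TIMEOUT_SECONDS", 600))  # 300
    print(get_int_env_var("EXAMPLE_UNSET_VARIABLE", 600))   # 600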
azure/ai/evaluation/_evaluators/__init__.py
@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
azure/ai/evaluation/_evaluators/_bleu/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._bleu import BleuScoreEvaluator
+
+ __all__ = [
+     "BleuScoreEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -0,0 +1,72 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+
+ from azure.ai.evaluation._common.utils import nltk_tokenize
+
+
+ class _AsyncBleuScoreEvaluator:
+     def __init__(self):
+         pass
+
+     async def __call__(self, *, response: str, ground_truth: str, **kwargs):
+         reference_tokens = nltk_tokenize(ground_truth)
+         hypothesis_tokens = nltk_tokenize(response)
+
+         # NIST Smoothing
+         smoothing_function = SmoothingFunction().method4
+         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+         return {
+             "bleu_score": score,
+         }
+
+
+ class BleuScoreEvaluator:
+     """
+     Calculate the BLEU score for a given response and ground truth.
+
+     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
+     translation. It is widely used in text summarization and text generation use cases.
+
+     Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+     especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+     indicator of quality.
+
+     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START bleu_score_evaluator]
+             :end-before: [END bleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a BleuScoreEvaluator.
+     """
+
+     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     def __init__(self):
+         self._async_evaluator = _AsyncBleuScoreEvaluator()
+
+     def __call__(self, *, response: str, ground_truth: str, **kwargs):
+         """
+         Evaluate the BLEU score between the response and the ground truth.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The BLEU score.
+         :rtype: Dict[str, float]
+         """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
+         )
+
+     def _to_async(self):
+         return self._async_evaluator
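A short usage sketch of the evaluator above, assuming nltk and promptflow are installed; the import path mirrors the `_bleu` package shown in this diff, and the sample strings are illustrative:

    from azure.ai.evaluation._evaluators._bleu import BleuScoreEvaluator

    bleu = BleuScoreEvaluator()
    result = bleu(
        response="Tokyo is the capital city of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result["bleu_score"])  # float in [0, 1]; higher means more n-gram overlap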
azure/ai/evaluation/_evaluators/_coherence/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._coherence import CoherenceEvaluator
+
+ __all__ = ["CoherenceEvaluator"]
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -0,0 +1,107 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ from typing import Dict, Union, List
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+
+
+ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """
+     Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+     The coherence measure assesses the ability of the language model to generate text that reads naturally,
+     flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+     and user-friendliness of a model's generated responses in real-world applications.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START coherence_evaluator]
+             :end-before: [END coherence_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a CoherenceEvaluator with a query and response.
+
+     .. note::
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     """
+
+     _PROMPTY_FILE = "coherence.prompty"
+     _RESULT_KEY = "coherence"
+
+     id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         response: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate coherence for a given query and response.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The coherence score.
+         :rtype: Dict[str, float]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluate coherence for a conversation.
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The coherence score.
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate coherence. Accepts either a query and response for a single evaluation,
+         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+         turns, the evaluator will aggregate the results of each turn.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages". Conversation turns are expected
+             to be dictionaries with keys "content" and "role".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The coherence score.
+         :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
+         """
+         return super().__call__(*args, **kwargs)
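A hedged usage sketch for the evaluator above. The `model_config` keys follow the `AzureOpenAIModelConfiguration` shape referenced in the docstring (assumed here to be `azure_endpoint`, `azure_deployment`, and `api_key`), and all values are placeholders:

    from azure.ai.evaluation._evaluators._coherence import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder endpoint
        "azure_deployment": "<your-deployment>",
        "api_key": "<your-api-key>",
    }
    coherence = CoherenceEvaluator(model_config=model_config)

    # Single query/response pair
    print(coherence(query="What is the capital of France?", response="Paris is the capital of France."))

    # Multi-turn conversation; per-turn scores are aggregated as described above
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris is the capital of France."},
        ]
    }
    print(coherence(conversation=conversation))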