azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +9 -16
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +159 -29
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +80 -2
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +120 -7
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py
@@ -10,9 +10,10 @@ from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
+import math
 
 import pandas as pd
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (
     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
@@ -46,7 +47,7 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
-    from
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
 
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
@@ -126,13 +127,90 @@ def process_message_content(content, images_folder_path):
         f.write(image_data_binary)
     return None
 
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
+    )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
+
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(),
+            path=tmpdir,
+            metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+            )
+        )
+
+        update_run_response = client.update_evaluation_run(
+            name=upload_run_response.id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    'evaluationResultId': create_evaluation_result_response.id,
+                },
+                properties=properties,
+            )
+        )
+
+    return update_run_response.properties.get("AiStudioEvaluationUri")
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
     trace_destination: Optional[str],
-    run: Run,
+    run: Optional[Run],
     evaluation_name: Optional[str],
+    name_map: Dict[str, str],
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -187,14 +265,14 @@ def _log_metrics_and_instance_results(
         # adding these properties to avoid showing traces if a dummy run is created.
         # We are doing that only for the pure evaluation runs.
         if run is None:
-
-            properties={
+            properties = {
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
                 EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
             }
-            )
+            properties.update(_convert_name_map_into_property_entries(name_map))
+            ev_run.write_properties_to_run_history(properties=properties)
         else:
             ev_run.write_properties_to_run_history(
                 properties={
@@ -241,7 +319,7 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
 
     print(f'Evaluation results saved to "{p.resolve()}".\n')
 
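The `ensure_ascii=False` change means non-ASCII characters in saved evaluation results are written as UTF-8 text rather than as `\uXXXX` escapes. A quick standard-library illustration of the difference:

    import json

    json.dumps({"response": "café"})                      # '{"response": "caf\\u00e9"}'
    json.dumps({"response": "café"}, ensure_ascii=False)  # '{"response": "café"}'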
@@ -329,6 +407,41 @@ def set_event_loop_policy() -> None:
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i:i + w] for i in range(0, len(s), w)]
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if (num_segments > max_segments):
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
 
 class JSONLDataFileLoader:
     def __init__(self, filename: Union[os.PathLike, str]):
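The new `_convert_name_map_into_property_entries` helper exists because run-history property values are length-limited: the JSON-serialized name map is sliced into fixed-width segments, each stored under its own property key, with a companion length entry recording the segment count (or -1 when even segmentation will not fit). A minimal standalone sketch of the same chunking; the literal property-key strings here are placeholders, not the package's actual `EvaluationRunProperties` constants:

    import json
    import math

    def segment_name_map(name_map: dict, segment_length: int = 950, max_segments: int = 10) -> dict:
        serialized = json.dumps(name_map)
        num_segments = math.ceil(len(serialized) / segment_length)
        if num_segments > max_segments:
            # Too long even when segmented; record -1 so the consumer knows we gave up.
            return {"name_map_length": -1}
        result = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = serialized[i * segment_length:(i + 1) * segment_length]
        return result

    # A ~2000-character serialized map yields three segments at the default width:
    # {"name_map_length": 3, "name_map_0": ..., "name_map_1": ..., "name_map_2": ...}

A consumer can rebuild the map by concatenating the segments in index order and calling `json.loads` on the result.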
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -6,7 +6,7 @@ import inspect
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
-from
+from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -495,7 +495,8 @@ class AsyncEvaluatorBase:
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
         self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
-
+        tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
+        retrieved_documents=None,**kwargs
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
@@ -509,11 +510,15 @@ class AsyncEvaluatorBase:
             kwargs["context"] = context
         if ground_truth is not None:
             kwargs["ground_truth"] = ground_truth
-        if
-        kwargs["
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
         if tool_definitions is not None:
             kwargs["tool_definitions"] = tool_definitions
         if messages is not None:
             kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
 
         return await self._real_call(**kwargs)
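The widened keyword list lets the async wrapper forward agent-specific inputs (tool calls, tool definitions, retrieval inputs) to evaluators that accept them instead of dropping them. A hedged sketch of how one such evaluator from this package is typically called; the payload shapes follow OpenAI-style tool-call records and are illustrative rather than a documented contract:

    from azure.ai.evaluation import ToolCallAccuracyEvaluator

    # model_config is an AzureOpenAIModelConfiguration-style dict; values are placeholders.
    evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
    result = evaluator(
        query="What is the weather in Seattle?",
        tool_calls={
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "fetch_weather",
            "arguments": {"location": "Seattle"},
        },
        tool_definitions={
            "name": "fetch_weather",
            "description": "Fetches weather information for a location.",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        },
    )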
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py
@@ -4,7 +4,7 @@
 from concurrent.futures import as_completed
 from typing import TypeVar, Dict, List
 
-from
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,9 +4,13 @@
 
 import math
 import re
+import os
 from typing import Dict, TypeVar, Union
 
-
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
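The prompty backend is now selected at import time: by default the evaluators use the in-package port under `azure.ai.evaluation._legacy.prompty`, and setting `AI_EVALS_USE_PF_PROMPTY=true` restores promptflow's `AsyncPrompty`. Because the check runs when this module is first imported, the variable must be set before `azure.ai.evaluation` is imported:

    import os

    # Opt back into promptflow's AsyncPrompty; must happen before the
    # evaluation package (and thus _base_prompty_eval) is imported.
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

    from azure.ai.evaluation import FluencyEvaluator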
@@ -39,13 +43,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """
 
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +67,8 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(source=
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
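With this change, `is_reasoning_model` flows from the evaluator constructor through `**kwargs` into `AsyncPrompty.load`, letting the prompty config be adjusted for reasoning-model deployments. A hedged usage sketch, assuming the built-in prompty-based evaluators forward `**kwargs` to this base class (the flag is marked preview in the docstring above):

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder endpoint
        "azure_deployment": "<reasoning-model-deployment>",            # e.g. an o-series deployment
        "api_key": "<api-key>",
    }
    evaluator = GroundednessEvaluator(model_config=model_config, is_reasoning_model=True)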
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -12,7 +12,7 @@ from azure.ai.evaluation._common.constants import (
     _InternalAnnotationTasks,
 )
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._constants import _AggregationType
@@ -50,7 +50,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     def __init__(
         self,
         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        azure_ai_project: dict,
+        azure_ai_project: Union[dict, str],
         credential: TokenCredential,
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
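Widening `azure_ai_project` to `Union[dict, str]` is what lets the RAI-service evaluators target either a hub-based project (the classic workspace triad) or a Foundry (OneDP) project endpoint; the `is_onedp_project` helper imported above distinguishes the two. An illustration of the two accepted shapes (the URL is an example format, not a guaranteed contract):

    # Hub-based project: the classic triad dictionary.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # OneDP / Foundry project: a single endpoint URL string.
    azure_ai_project = "https://<resource>.services.ai.azure.com/api/projects/<project>"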
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py (new file)
@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
+
+__all__ = [
+    "DocumentRetrievalEvaluator",
+    "RetrievalGroundTruthDocument",
+    "RetrievedDocument"
+]
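The new sub-package exposes `DocumentRetrievalEvaluator` along with typed inputs for ground-truth and retrieved documents, which pair with the `retrieval_ground_truth` and `retrieved_documents` keywords added to `AsyncEvaluatorBase.__call__` above. A hedged sketch of its use; the constructor parameters and field names shown are assumptions drawn from the typed-dict names and should be checked against the shipped docs:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    # Assumed constructor parameters bounding the ground-truth relevance labels.
    evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=4)
    result = evaluator(
        retrieval_ground_truth=[
            {"document_id": "doc-1", "query_relevance_label": 4},  # field names assumed
            {"document_id": "doc-2", "query_relevance_label": 1},
        ],
        retrieved_documents=[
            {"document_id": "doc-1", "relevance_score": 1.8},      # field names assumed
            {"document_id": "doc-2", "relevance_score": 0.3},
        ],
    )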