azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -5
- azure/ai/evaluation/_common/rai_service.py +4 -4
- azure/ai/evaluation/_common/utils.py +19 -19
- azure/ai/evaluation/_constants.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- azure/ai/evaluation/_evaluate/_evaluate.py +35 -28
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_chat/_chat.py +16 -9
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +4 -10
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +1 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +1 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +2 -2
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +5 -10
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +5 -10
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +2 -2
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +2 -2
- azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +5 -10
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -10
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +1 -2
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +147 -80
- azure/ai/evaluation/simulator/_tracing.py +21 -24
- azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/METADATA +86 -14
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/RECORD +58 -56
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

@@ -25,11 +25,7 @@ from ._evaluators._relevance import RelevanceEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import (
-    AzureAIProject,
-    AzureOpenAIModelConfiguration,
-    OpenAIModelConfiguration,
-)
+from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
 __all__ = [
     "evaluate",
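For orientation, a short sketch of what the consolidated import gives the caller at the package root. The keys shown for the model configuration are only those referenced elsewhere in this diff ("azure_endpoint", "azure_deployment", "api_version"); the full set is an assumption.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, evaluate

# The configuration types behave like plain dictionaries at runtime.
model_config: AzureOpenAIModelConfiguration = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_version": "<api-version>",
}
```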
azure/ai/evaluation/_common/rai_service.py
CHANGED

@@ -11,12 +11,12 @@ from urllib.parse import urlparse
 
 import jwt
 import numpy as np
-from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
 
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential
 
 from .constants import (
     CommonConstants,
@@ -348,7 +348,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
     )
 
     if response.status_code != 200:
-        msg =
+        msg = "Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
             internal_message=msg,
azure/ai/evaluation/_common/utils.py
CHANGED

@@ -2,20 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+import threading
+from typing import List, Optional, Union
 
-
-
-except ImportError:
-    import constants
+import nltk
+import numpy as np
 
-from
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
-import
-import numpy as np
-import nltk
+from . import constants
 
 _nltk_data_download_lock = threading.Lock()
 
@@ -46,7 +41,7 @@ def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
     with _nltk_data_download_lock:
         try:
-            from nltk.tokenize.nist import NISTTokenizer
+            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
         except LookupError:
             nltk.download("perluniprops")
             nltk.download("punkt")
@@ -54,12 +49,19 @@ def ensure_nltk_data_downloaded():
 
 
 def nltk_tokenize(text: str) -> List[str]:
-    """Tokenize the input text using the NLTK tokenizer.
+    """Tokenize the input text using the NLTK tokenizer.
+
+    :param text: The text to tokenize
+    :type text: str
+    :return: A list of tokens
+    :rtype: list[str]
+    """
     ensure_nltk_data_downloaded()
 
     if not text.isascii():
         # Use NISTTokenizer for international tokenization
         from nltk.tokenize.nist import NISTTokenizer
+
         tokens = NISTTokenizer().international_tokenize(text)
     else:
         # By default, use NLTK word tokenizer
@@ -68,20 +70,18 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def
+def ensure_api_version_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     default_api_version: str,
 ) -> None:
-    if (
-        "azure_endpoint" in model_config or "azure_deployment" in model_config
-    ):
+    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
         model_config["api_version"] = model_config.get("api_version", default_api_version)
 
 
-def
+def ensure_user_agent_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     prompty_model_config: dict,
     user_agent: Optional[str] = None,
 ) -> None:
     if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
-        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
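The two helpers above only act on Azure OpenAI style configurations. A minimal sketch of the intended behavior of `ensure_api_version_in_aoai_model_config` (the module is internal, so this is illustrative only, and the default value is a placeholder):

```python
from azure.ai.evaluation._common.utils import ensure_api_version_in_aoai_model_config

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    # no "api_version" supplied
}

# Mutates the config in place: "azure_endpoint" is present and "api_version" is not,
# so the supplied default is filled in.
ensure_api_version_in_aoai_model_config(model_config, default_api_version="<default-api-version>")
assert model_config["api_version"] == "<default-api-version>"
```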
azure/ai/evaluation/_constants.py
CHANGED

@@ -39,6 +39,15 @@ class Prefixes:
     TSG_OUTPUTS = "__outputs."
 
 
+class DefaultOpenEncoding:
+    """Enum that captures SDK's default values for the encoding param of open(...)"""
+
+    READ = "utf-8-sig"
+    """SDK Default Encoding when reading a file"""
+    WRITE = "utf-8"
+    """SDK Default Encoding when writing a file"""
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
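A short sketch of how the new `DefaultOpenEncoding` constants are meant to be used when files are read and written; the usage pattern is an assumption, the constant values come from the hunk above.

```python
from azure.ai.evaluation._constants import DefaultOpenEncoding

# Write plain UTF-8; read with utf-8-sig so a BOM, if present, is stripped.
with open("evaluation_results.json", "w", encoding=DefaultOpenEncoding.WRITE) as f:
    f.write("{}")

with open("evaluation_results.json", "r", encoding=DefaultOpenEncoding.READ) as f:
    results = f.read()
```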
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
CHANGED

@@ -5,13 +5,14 @@ import os
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
 )
-from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
 
 from ..._user_agent import USER_AGENT
 from .._utils import set_event_loop_policy
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
CHANGED

@@ -4,13 +4,16 @@
 import inspect
 import json
 import logging
+import os
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union
 
 import pandas as pd
-
 from promptflow.contracts.types import AttrDict
-from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
+
+from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
 
@@ -18,7 +21,9 @@ LOGGER = logging.getLogger(__name__)
 
 
 class CodeRun:
-    def __init__(
+    def __init__(
+        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
+    ):
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
@@ -40,13 +45,13 @@ class CodeRun:
                 else None
             )
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
             aggregated_metrics = None
 
         if not isinstance(aggregated_metrics, dict):
             LOGGER.warning(
-
-
+                "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
+                self.evaluator_name,
             )
 
         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -54,11 +59,15 @@ class CodeRun:
         return aggregated_metrics
 
 
-class CodeClient:
-    def __init__(
+class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self,
+    ) -> None:
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def _calculate_metric(
+    def _calculate_metric(
+        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
+    ) -> pd.DataFrame:
         row_metric_futures = []
         row_metric_results = []
         input_df = _apply_column_mapping(input_df, column_mapping)
@@ -110,18 +119,25 @@ class CodeClient:
             return aggregated_output
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.warning(
-
+                "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
             )
             return None
 
-    def run(
+    def run(
+        self,  # pylint: disable=unused-argument
+        flow: Callable,
+        data: Union[os.PathLike, Path, pd.DataFrame],
+        evaluator_name: Optional[str] = None,
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs,
+    ) -> CodeRun:
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
             try:
                 json_data = load_jsonl(data)
             except json.JSONDecodeError as exc:
                 raise EvaluationException(
-                    message
+                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                     internal_message="Failed to parse data as JSON",
                     target=ErrorTarget.CODE_CLIENT,
                     category=ErrorCategory.INVALID_VALUE,
@@ -129,22 +145,28 @@ class CodeClient:
                 ) from exc
 
             input_df = pd.DataFrame(json_data)
-        eval_future = self._thread_pool.submit(
+        eval_future = self._thread_pool.submit(
+            self._calculate_metric,
+            evaluator=flow,
+            input_df=input_df,
+            column_mapping=column_mapping,
+            evaluator_name=evaluator_name,
+        )
         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
         run.aggregated_metrics = aggregation_future
         return run
 
-    def get_details(self, run, all_results=False):
+    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run):
+    def get_metrics(self, run: CodeRun) -> Optional[None]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return None
         return aggregated_metrics
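Taken together, `CodeClient.run` now accepts either a `.jsonl` path or an in-memory `pandas.DataFrame` and returns a `CodeRun` whose per-row details and aggregated metrics are fetched separately. A rough usage sketch based only on the signatures shown above (this is an internal client, and the evaluator is hypothetical):

```python
import pandas as pd
from azure.ai.evaluation._evaluate._batch_run_client import CodeClient

def exact_match(*, response, ground_truth):  # hypothetical evaluator callable
    return {"exact_match": float(response == ground_truth)}

client = CodeClient()
run = client.run(
    flow=exact_match,
    data=pd.DataFrame([{"response": "4", "ground_truth": "4"}]),
    evaluator_name="exact_match",
)
per_row = client.get_details(run)   # DataFrame of per-row results
metrics = client.get_metrics(run)   # aggregated metrics, if the evaluator defines an aggregator
```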
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
CHANGED

@@ -4,29 +4,40 @@
 import inspect
 import logging
 import os
+from concurrent.futures import Future
+from typing import Any, Callable, Dict, Optional, Union
 
 import numpy as np
-
+import pandas as pd
 from promptflow.client import PFClient
+from promptflow.entities import Run
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
 LOGGER = logging.getLogger(__name__)
 
 
 class ProxyRun:
-    def __init__(self, run, **kwargs):
+    def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
         self.run = run
 
 
-class ProxyClient:
-    def __init__(
+class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self, pf_client: PFClient
+    ) -> None:
         self._pf_client = pf_client
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def run(
+    def run(
+        self,
+        flow: Union[str, os.PathLike, Callable],
+        data: Union[str, os.PathLike],
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs
+    ) -> ProxyRun:
         flow_to_run = flow
         if hasattr(flow, "_to_async"):
-            flow_to_run = flow._to_async()
+            flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
@@ -39,14 +50,14 @@ class ProxyClient:
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self, proxy_run, all_results=False):
-        run = proxy_run.run.result()
+    def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", np.nan, inplace=True)
         return result_df
 
-    def get_metrics(self, proxy_run):
-        run = proxy_run.run.result()
+    def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
     @staticmethod
@@ -54,8 +65,7 @@ class ProxyClient:
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
-
+            if inspect.iscoroutinefunction(flow):
                 return True
-
-            return False
+            return False
         return False
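The coroutine check above means a callable whose `__call__` is async, or an async function itself, takes the async batch path unless `PF_EVALS_BATCH_USE_ASYNC` is set to something other than `"true"`. A small sketch of the two shapes the check recognizes (the evaluator bodies are made up):

```python
import os

class AsyncEvaluator:
    async def __call__(self, *, response: str):  # detected via flow.__call__ being a coroutine function
        return {"score": 1.0}

async def async_flow(*, response: str):  # detected via the callable itself being a coroutine function
    return {"score": 1.0}

# Opting out of the async batch path; assumption: any value other than "true" disables it.
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false"
```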
azure/ai/evaluation/_evaluate/_eval_run.py
CHANGED

@@ -8,17 +8,18 @@ import logging
 import os
 import posixpath
 import time
+import types
 import uuid
-from typing import Any, Dict, Optional, Set
+from typing import Any, Dict, Optional, Set, Type
 from urllib.parse import urlparse
 
-from azure.core.pipeline.policies import RetryPolicy
-from azure.core.rest import HttpResponse
-
 from promptflow._sdk.entities import Run
+
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
 from azure.ai.evaluation._version import VERSION
-from azure.
+from azure.core.pipeline.policies import RetryPolicy
+from azure.core.rest import HttpResponse
 
 LOGGER = logging.getLogger(__name__)
 
@@ -165,7 +166,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         self._url_base = urlparse(self._tracking_uri).netloc
         if self._promptflow_run is not None:
             self.info = RunInfo(
-                self._promptflow_run.name,
+                self._promptflow_run.name,
+                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run.name,
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
@@ -181,8 +184,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         if response.status_code != 200:
             self.info = RunInfo.generate(self._run_name)
             LOGGER.warning(
-
-                "The results will be saved locally, but will not be logged to Azure."
+                "The run failed to start: %s: %s."
+                "The results will be saved locally, but will not be logged to Azure.",
+                response.status_code,
+                response.text(),
             )
             self._status = RunStatus.BROKEN
         else:
@@ -216,7 +221,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
                 internal_message="Incorrect terminal status. Valid statuses are 'FINISHED', 'FAILED' and 'KILLED'",
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN,
             )
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/update"
         body = {
@@ -239,8 +244,21 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         self._start_run()
         return self
 
-    def __exit__(
-
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> Optional[bool]:
+        """The context manager exit call.
+
+        :param exc_type: The exception type
+        :type exc_type: Optional[Type[BaseException]]
+        :param exc_value: The exception value
+        :type exc_value: Optional[BaseException]
+        :param exc_tb: The exception traceback
+        :type exc_tb: Optional[types.TracebackType]
+        """
         self._end_run("FINISHED")
 
     def get_run_history_uri(self) -> str:
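The fully annotated `__exit__` above follows the standard context-manager protocol, and the same typing pattern applies to any `contextlib.AbstractContextManager` subclass. A generic sketch of the shape of that signature (not the SDK's class):

```python
import contextlib
import types
from typing import Optional, Type

class TimedBlock(contextlib.AbstractContextManager):
    def __enter__(self) -> "TimedBlock":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> Optional[bool]:
        # Returning None (falsy) lets any exception propagate, as EvalRun does above.
        return None
```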
@@ -280,7 +298,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         # is an optional dependency.
         from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
 
-        return ArmTokenCache().get_token(self._ml_client._credential)
+        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -326,9 +344,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         :type response: HttpResponse
         """
         LOGGER.warning(
-
-
-
+            "Unable to %s, the request failed with status code %s, response.text()=%s.",
+            failed_op,
+            response.status_code,
+            response.text(),
         )
 
     def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool:
@@ -342,7 +361,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         :type bad_states: Set[RunStatus]
         :param should_raise: Should we raise an error if the bad state has been encountered
         :type should_raise: bool
-        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
+        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
+            and invalid state was encountered.
         :return: Whether or not run is in the correct state.
         :rtype: bool
         """
@@ -354,7 +374,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
                 internal_message=msg,
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN,
             )
             LOGGER.warning(msg)
             return False
@@ -446,7 +466,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
             return credential.account_key
         if hasattr(credential, "sas_token"):
             return credential.sas_token
-        return self._ml_client.datastores._credential
+        return self._ml_client.datastores._credential  # pylint: disable=protected-access
 
     def log_metric(self, key: str, value: float) -> None:
         """
azure/ai/evaluation/_evaluate/_evaluate.py
CHANGED

@@ -8,27 +8,26 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
-
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
     Prefixes,
     _InternalEvaluationMetrics,
 )
+from .._model_configurations import AzureAIProject
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
-from ._telemetry import log_evaluate_activity
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 # pylint: disable=line-too-long
@@ -260,12 +259,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-
-
-
-
-
-
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
 
     return initial_data_df
 
@@ -436,10 +435,10 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
+    data: str,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
@@ -448,16 +447,16 @@
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
     :keyword evaluation_name: Display name of the evaluation.
    :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
-    :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None
-    :paramtype data: Optional[str]
-    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-    :paramtype evaluators: Optional[Dict[str, Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
         keys as the column names in the evaluator input and values as the column names in the input data or data
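With `data` and `evaluators` now required keyword arguments, a minimal call looks roughly like the sketch below; the data file and the custom evaluator are placeholders, and any callable whose keyword parameters match column names in the data can serve as an evaluator.

```python
from azure.ai.evaluation import evaluate

def exact_match(*, response: str, ground_truth: str):
    """Hypothetical custom evaluator keyed by column names in the data."""
    return {"exact_match": float(response == ground_truth)}

result = evaluate(
    data="eval_data.jsonl",                   # required: path to a .jsonl file
    evaluators={"exact_match": exact_match},  # required: alias -> evaluator callable
    evaluation_name="sample-run",             # optional display name
)
print(result)
```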
@@ -572,22 +571,21 @@ def _evaluate(  # pylint: disable=too-many-locals
         user_agent=USER_AGENT,
     )
 
-    trace_destination = pf_client._config.get_trace_destination()
-
+    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    evaluator_config = evaluator_config or {}
+    evaluator_config.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-    # Make sure, the default is always in the configuration.
-    if not evaluator_config:
-        evaluator_config = {}
-    if "default" not in evaluator_config:
-        evaluator_config["default"] = {}
-
     for evaluator_name, mapping in evaluator_config.items():
         mapped_to_values = set(mapping.values())
         for col in target_generated_columns:
@@ -604,6 +602,16 @@ def _evaluate(  # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
 
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
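The default mapping built above uses the `${data.<column>}` reference syntax, and an explicit `evaluator_config` entry overrides it per evaluator. A hedged sketch of what that looks like from the caller's side (evaluator and column names are made up):

```python
from azure.ai.evaluation import evaluate

def length_check(*, response: str):
    """Hypothetical evaluator; receives the column mapped to its "response" keyword."""
    return {"length": len(response)}

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"length_check": length_check},
    evaluator_config={
        "length_check": {
            # evaluator keyword -> reference to a column in the input data
            "response": "${data.answer}",
        },
    },
)
```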
@@ -672,7 +680,6 @@
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
|